| 1 /* libFLAC - Free Lossless Audio Codec library |
| 2 * Copyright (C) 2000-2009 Josh Coalson |
| 3 * Copyright (C) 2011-2014 Xiph.Org Foundation |
| 4 * |
| 5 * Redistribution and use in source and binary forms, with or without |
| 6 * modification, are permitted provided that the following conditions |
| 7 * are met: |
| 8 * |
| 9 * - Redistributions of source code must retain the above copyright |
| 10 * notice, this list of conditions and the following disclaimer. |
| 11 * |
| 12 * - Redistributions in binary form must reproduce the above copyright |
| 13 * notice, this list of conditions and the following disclaimer in the |
| 14 * documentation and/or other materials provided with the distribution. |
| 15 * |
| 16 * - Neither the name of the Xiph.org Foundation nor the names of its |
| 17 * contributors may be used to endorse or promote products derived from |
| 18 * this software without specific prior written permission. |
| 19 * |
| 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR |
| 24 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 25 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 26 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 27 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| 28 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| 29 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 31 */ |
| 32 |
| 33 #ifdef HAVE_CONFIG_H |
| 34 # include <config.h> |
| 35 #endif |
| 36 |
| 37 #ifndef FLAC__INTEGER_ONLY_LIBRARY |
| 38 #ifndef FLAC__NO_ASM |
| 39 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN |
| 40 #include "private/lpc.h" |
| 41 #ifdef FLAC__SSE4_1_SUPPORTED |
| 42 |
| 43 #include "FLAC/assert.h" |
| 44 #include "FLAC/format.h" |
| 45 |
| 46 #include <smmintrin.h> /* SSE4.1 */ |
| 47 |
| 48 #if defined FLAC__CPU_IA32 /* unused for x64 */ |
| 49 |
| 50 #define RESIDUAL64_RESULT(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srl_epi64(xmmN, cnt)) |
| 51 #define RESIDUAL64_RESULT1(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srli_epi64(xmmN, lp_quantization)) |
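|    /* Note: each macro above emits one residual sample: the 64-bit prediction sum in the low lane of xmmN |
|       is shifted right by lp_quantization and its low 32 bits are subtracted from data[i]. |
|       RESIDUAL64_RESULT takes the run-time shift count kept in `cnt`; RESIDUAL64_RESULT1 uses the |
|       immediate-count _mm_srli_epi64 form. The shift is logical, since SSE4.1 has no 64-bit arithmetic |
|       shift (see the FLAC__ASSERT below). */ |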
| 52 |
| 53 FLAC__SSE_TARGET("sse4.1") |
| 54 void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) |
| 55 { |
| 56 int i; |
| 57 __m128i cnt = _mm_cvtsi32_si128(lp_quantization); |
| 58 |
| 59 FLAC__ASSERT(order > 0); |
| 60 FLAC__ASSERT(order <= 32); |
| 61 FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */ |
| 62 |
| 63 if(order <= 12) { |
| 64 if(order > 8) { /* order == 9, 10, 11, 12 */ |
| 65 if(order > 10) { /* order == 11, 12 */ |
| 66 if(order == 12) { |
| 67 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; |
| 68 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0 0 q[1] q[0] |
| 69 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0 0 q[3] q[2] |
| 70 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0 0 q[5] q[4] |
| 71 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0 0 q[7] q[6] |
| 72 xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0 0 q[9] q[8] |
| 73 xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10] |
| 74 |
| 75 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1]  0 q[0] |
| 76 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3]  0 q[2] |
| 77 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0 q[5]  0 q[4] |
| 78 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0 q[7]  0 q[6] |
| 79 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0 q[9]  0 q[8] |
| 80 xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0 q[11] 0 q[10] |
| 81 |
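|    /* For reference: each xmmN coefficient register now holds two taps, one in the low 32 bits of each |
|       64-bit lane. Every iteration below loads a pair of history samples, swaps them into the matching |
|       lanes, and _mm_mul_epi32 yields the two signed 64-bit products qlp_coeff[j]*data[i-j-1] and |
|       qlp_coeff[j+1]*data[i-j-2]; the partial sums are accumulated with _mm_add_epi64 and the two lanes |
|       are folded together with _mm_srli_si128 before RESIDUAL64_RESULT1 stores residual[i]. */ |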
| 82 for(i = 0; i < (int)data_len; i++) { |
| 83 //sum = 0; |
| 84 //sum += qlp_coeff[11] * (FLAC__int64)data[i-12]; |
| 85 //sum += qlp_coeff[10] * (FLAC__int64)data[i-11]; |
| 86 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12)); // 0 0 d[i-11] d[i-12] |
| 87 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11] |
| 88 xmm7 = _mm_mul_epi32(xmm7, xmm5); |
| 89 |
| 90 //sum += qlp_coeff[9] * (FLAC__i
nt64)data[i-10]; |
| 91 //sum += qlp_coeff[8] * (FLAC__i
nt64)data[i-9]; |
| 92 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-10)); |
| 93 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 94 xmm6 = _mm_mul_epi32(xmm6, xmm4)
; |
| 95 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 96 |
| 97 //sum += qlp_coeff[7] * (FLAC__i
nt64)data[i-8]; |
| 98 //sum += qlp_coeff[6] * (FLAC__i
nt64)data[i-7]; |
| 99 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-8)); |
| 100 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 101 xmm6 = _mm_mul_epi32(xmm6, xmm3)
; |
| 102 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 103 |
| 104 //sum += qlp_coeff[5] * (FLAC__i
nt64)data[i-6]; |
| 105 //sum += qlp_coeff[4] * (FLAC__i
nt64)data[i-5]; |
| 106 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-6)); |
| 107 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 108 xmm6 = _mm_mul_epi32(xmm6, xmm2)
; |
| 109 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 110 |
| 111 //sum += qlp_coeff[3] * (FLAC__i
nt64)data[i-4]; |
| 112 //sum += qlp_coeff[2] * (FLAC__i
nt64)data[i-3]; |
| 113 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-4)); |
| 114 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 115 xmm6 = _mm_mul_epi32(xmm6, xmm1)
; |
| 116 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 117 |
| 118 //sum += qlp_coeff[1] * (FLAC__i
nt64)data[i-2]; |
| 119 //sum += qlp_coeff[0] * (FLAC__i
nt64)data[i-1]; |
| 120 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 121 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 122 xmm6 = _mm_mul_epi32(xmm6, xmm0)
; |
| 123 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 124 |
| 125 xmm7 = _mm_add_epi64(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 126 RESIDUAL64_RESULT1(xmm7); |
| 127 } |
| 128 } |
| 129 else { /* order == 11 */ |
| 130 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xm
m5, xmm6, xmm7; |
| 131 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 132 xmm1 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+2)); |
| 133 xmm2 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+4)); |
| 134 xmm3 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+6)); |
| 135 xmm4 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+8)); |
| 136 xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]); |
| 137 |
| 138 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 139 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF
LE(3,1,2,0)); |
| 140 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF
LE(3,1,2,0)); |
| 141 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFF
LE(3,1,2,0)); |
| 142 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFF
LE(3,1,2,0)); |
| 143 |
| 144 for(i = 0; i < (int)data_len; i++) { |
| 145 //sum = 0; |
| 146 //sum = qlp_coeff[10] * (FLAC__
int64)data[i-11]; |
| 147 xmm7 = _mm_cvtsi32_si128(data[i-
11]); |
| 148 xmm7 = _mm_mul_epi32(xmm7, xmm5)
; |
| 149 |
| 150 //sum += qlp_coeff[9] * (FLAC__i
nt64)data[i-10]; |
| 151 //sum += qlp_coeff[8] * (FLAC__i
nt64)data[i-9]; |
| 152 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-10)); |
| 153 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 154 xmm6 = _mm_mul_epi32(xmm6, xmm4)
; |
| 155 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 156 |
| 157 //sum += qlp_coeff[7] * (FLAC__i
nt64)data[i-8]; |
| 158 //sum += qlp_coeff[6] * (FLAC__i
nt64)data[i-7]; |
| 159 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-8)); |
| 160 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 161 xmm6 = _mm_mul_epi32(xmm6, xmm3)
; |
| 162 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 163 |
| 164 //sum += qlp_coeff[5] * (FLAC__i
nt64)data[i-6]; |
| 165 //sum += qlp_coeff[4] * (FLAC__i
nt64)data[i-5]; |
| 166 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-6)); |
| 167 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 168 xmm6 = _mm_mul_epi32(xmm6, xmm2)
; |
| 169 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 170 |
| 171 //sum += qlp_coeff[3] * (FLAC__i
nt64)data[i-4]; |
| 172 //sum += qlp_coeff[2] * (FLAC__i
nt64)data[i-3]; |
| 173 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-4)); |
| 174 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 175 xmm6 = _mm_mul_epi32(xmm6, xmm1)
; |
| 176 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 177 |
| 178 //sum += qlp_coeff[1] * (FLAC__i
nt64)data[i-2]; |
| 179 //sum += qlp_coeff[0] * (FLAC__i
nt64)data[i-1]; |
| 180 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 181 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 182 xmm6 = _mm_mul_epi32(xmm6, xmm0)
; |
| 183 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 184 |
| 185 xmm7 = _mm_add_epi64(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 186 RESIDUAL64_RESULT1(xmm7); |
| 187 } |
| 188 } |
| 189 } |
| 190 else { /* order == 9, 10 */ |
| 191 if(order == 10) { |
| 192 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xm
m6, xmm7; |
| 193 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 194 xmm1 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+2)); |
| 195 xmm2 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+4)); |
| 196 xmm3 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+6)); |
| 197 xmm4 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+8)); |
| 198 |
| 199 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 200 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF
LE(3,1,2,0)); |
| 201 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF
LE(3,1,2,0)); |
| 202 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFF
LE(3,1,2,0)); |
| 203 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFF
LE(3,1,2,0)); |
| 204 |
| 205 for(i = 0; i < (int)data_len; i++) { |
| 206 //sum = 0; |
| 207 //sum += qlp_coeff[9] * (FLAC__i
nt64)data[i-10]; |
| 208 //sum += qlp_coeff[8] * (FLAC__i
nt64)data[i-9]; |
| 209 xmm7 = _mm_loadl_epi64((const __
m128i*)(data+i-10)); |
| 210 xmm7 = _mm_shuffle_epi32(xmm7, _
MM_SHUFFLE(2,0,3,1)); |
| 211 xmm7 = _mm_mul_epi32(xmm7, xmm4)
; |
| 212 |
| 213 //sum += qlp_coeff[7] * (FLAC__i
nt64)data[i-8]; |
| 214 //sum += qlp_coeff[6] * (FLAC__i
nt64)data[i-7]; |
| 215 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-8)); |
| 216 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 217 xmm6 = _mm_mul_epi32(xmm6, xmm3)
; |
| 218 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 219 |
| 220 //sum += qlp_coeff[5] * (FLAC__i
nt64)data[i-6]; |
| 221 //sum += qlp_coeff[4] * (FLAC__i
nt64)data[i-5]; |
| 222 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-6)); |
| 223 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 224 xmm6 = _mm_mul_epi32(xmm6, xmm2)
; |
| 225 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 226 |
| 227 //sum += qlp_coeff[3] * (FLAC__i
nt64)data[i-4]; |
| 228 //sum += qlp_coeff[2] * (FLAC__i
nt64)data[i-3]; |
| 229 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-4)); |
| 230 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 231 xmm6 = _mm_mul_epi32(xmm6, xmm1)
; |
| 232 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 233 |
| 234 //sum += qlp_coeff[1] * (FLAC__i
nt64)data[i-2]; |
| 235 //sum += qlp_coeff[0] * (FLAC__i
nt64)data[i-1]; |
| 236 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 237 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 238 xmm6 = _mm_mul_epi32(xmm6, xmm0)
; |
| 239 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 240 |
| 241 xmm7 = _mm_add_epi64(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 242 RESIDUAL64_RESULT(xmm7); |
| 243 } |
| 244 } |
| 245 else { /* order == 9 */ |
| 246 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xm
m6, xmm7; |
| 247 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 248 xmm1 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+2)); |
| 249 xmm2 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+4)); |
| 250 xmm3 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+6)); |
| 251 xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]); |
| 252 |
| 253 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 254 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF
LE(3,1,2,0)); |
| 255 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF
LE(3,1,2,0)); |
| 256 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFF
LE(3,1,2,0)); |
| 257 |
| 258 for(i = 0; i < (int)data_len; i++) { |
| 259 //sum = 0; |
| 260 //sum = qlp_coeff[8] * (FLAC__i
nt64)data[i-9]; |
| 261 xmm7 = _mm_cvtsi32_si128(data[i-
9]); |
| 262 xmm7 = _mm_mul_epi32(xmm7, xmm4)
; |
| 263 |
| 264 //sum += qlp_coeff[7] * (FLAC__i
nt64)data[i-8]; |
| 265 //sum += qlp_coeff[6] * (FLAC__i
nt64)data[i-7]; |
| 266 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-8)); |
| 267 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 268 xmm6 = _mm_mul_epi32(xmm6, xmm3)
; |
| 269 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 270 |
| 271 //sum += qlp_coeff[5] * (FLAC__i
nt64)data[i-6]; |
| 272 //sum += qlp_coeff[4] * (FLAC__i
nt64)data[i-5]; |
| 273 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-6)); |
| 274 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 275 xmm6 = _mm_mul_epi32(xmm6, xmm2)
; |
| 276 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 277 |
| 278 //sum += qlp_coeff[3] * (FLAC__i
nt64)data[i-4]; |
| 279 //sum += qlp_coeff[2] * (FLAC__i
nt64)data[i-3]; |
| 280 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-4)); |
| 281 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 282 xmm6 = _mm_mul_epi32(xmm6, xmm1)
; |
| 283 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 284 |
| 285 //sum += qlp_coeff[1] * (FLAC__i
nt64)data[i-2]; |
| 286 //sum += qlp_coeff[0] * (FLAC__i
nt64)data[i-1]; |
| 287 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 288 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 289 xmm6 = _mm_mul_epi32(xmm6, xmm0)
; |
| 290 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 291 |
| 292 xmm7 = _mm_add_epi64(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 293 RESIDUAL64_RESULT(xmm7); |
| 294 } |
| 295 } |
| 296 } |
| 297 } |
| 298 else if(order > 4) { /* order == 5, 6, 7, 8 */ |
| 299 if(order > 6) { /* order == 7, 8 */ |
| 300 if(order == 8) { |
| 301 __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xm
m7; |
| 302 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 303 xmm1 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+2)); |
| 304 xmm2 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+4)); |
| 305 xmm3 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+6)); |
| 306 |
| 307 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 308 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF
LE(3,1,2,0)); |
| 309 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF
LE(3,1,2,0)); |
| 310 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFF
LE(3,1,2,0)); |
| 311 |
| 312 for(i = 0; i < (int)data_len; i++) { |
| 313 //sum = 0; |
| 314 //sum += qlp_coeff[7] * (FLAC__i
nt64)data[i-8]; |
| 315 //sum += qlp_coeff[6] * (FLAC__i
nt64)data[i-7]; |
| 316 xmm7 = _mm_loadl_epi64((const __
m128i*)(data+i-8)); |
| 317 xmm7 = _mm_shuffle_epi32(xmm7, _
MM_SHUFFLE(2,0,3,1)); |
| 318 xmm7 = _mm_mul_epi32(xmm7, xmm3)
; |
| 319 |
| 320 //sum += qlp_coeff[5] * (FLAC__i
nt64)data[i-6]; |
| 321 //sum += qlp_coeff[4] * (FLAC__i
nt64)data[i-5]; |
| 322 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-6)); |
| 323 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 324 xmm6 = _mm_mul_epi32(xmm6, xmm2)
; |
| 325 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 326 |
| 327 //sum += qlp_coeff[3] * (FLAC__i
nt64)data[i-4]; |
| 328 //sum += qlp_coeff[2] * (FLAC__i
nt64)data[i-3]; |
| 329 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-4)); |
| 330 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 331 xmm6 = _mm_mul_epi32(xmm6, xmm1)
; |
| 332 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 333 |
| 334 //sum += qlp_coeff[1] * (FLAC__i
nt64)data[i-2]; |
| 335 //sum += qlp_coeff[0] * (FLAC__i
nt64)data[i-1]; |
| 336 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 337 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 338 xmm6 = _mm_mul_epi32(xmm6, xmm0)
; |
| 339 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 340 |
| 341 xmm7 = _mm_add_epi64(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 342 RESIDUAL64_RESULT(xmm7); |
| 343 } |
| 344 } |
| 345 else { /* order == 7 */ |
| 346 __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xm
m7; |
| 347 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 348 xmm1 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+2)); |
| 349 xmm2 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+4)); |
| 350 xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]); |
| 351 |
| 352 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 353 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF
LE(3,1,2,0)); |
| 354 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF
LE(3,1,2,0)); |
| 355 |
| 356 for(i = 0; i < (int)data_len; i++) { |
| 357 //sum = 0; |
| 358 //sum = qlp_coeff[6] * (FLAC__i
nt64)data[i-7]; |
| 359 xmm7 = _mm_cvtsi32_si128(data[i-
7]); |
| 360 xmm7 = _mm_mul_epi32(xmm7, xmm3)
; |
| 361 |
| 362 //sum += qlp_coeff[5] * (FLAC__i
nt64)data[i-6]; |
| 363 //sum += qlp_coeff[4] * (FLAC__i
nt64)data[i-5]; |
| 364 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-6)); |
| 365 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 366 xmm6 = _mm_mul_epi32(xmm6, xmm2)
; |
| 367 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 368 |
| 369 //sum += qlp_coeff[3] * (FLAC__i
nt64)data[i-4]; |
| 370 //sum += qlp_coeff[2] * (FLAC__i
nt64)data[i-3]; |
| 371 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-4)); |
| 372 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 373 xmm6 = _mm_mul_epi32(xmm6, xmm1)
; |
| 374 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 375 |
| 376 //sum += qlp_coeff[1] * (FLAC__i
nt64)data[i-2]; |
| 377 //sum += qlp_coeff[0] * (FLAC__i
nt64)data[i-1]; |
| 378 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 379 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 380 xmm6 = _mm_mul_epi32(xmm6, xmm0)
; |
| 381 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 382 |
| 383 xmm7 = _mm_add_epi64(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 384 RESIDUAL64_RESULT(xmm7); |
| 385 } |
| 386 } |
| 387 } |
| 388 else { /* order == 5, 6 */ |
| 389 if(order == 6) { |
| 390 __m128i xmm0, xmm1, xmm2, xmm6, xmm7; |
| 391 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 392 xmm1 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+2)); |
| 393 xmm2 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+4)); |
| 394 |
| 395 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 396 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF
LE(3,1,2,0)); |
| 397 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF
LE(3,1,2,0)); |
| 398 |
| 399 for(i = 0; i < (int)data_len; i++) { |
| 400 //sum = 0; |
| 401 //sum += qlp_coeff[5] * (FLAC__i
nt64)data[i-6]; |
| 402 //sum += qlp_coeff[4] * (FLAC__i
nt64)data[i-5]; |
| 403 xmm7 = _mm_loadl_epi64((const __
m128i*)(data+i-6)); |
| 404 xmm7 = _mm_shuffle_epi32(xmm7, _
MM_SHUFFLE(2,0,3,1)); |
| 405 xmm7 = _mm_mul_epi32(xmm7, xmm2)
; |
| 406 |
| 407 //sum += qlp_coeff[3] * (FLAC__i
nt64)data[i-4]; |
| 408 //sum += qlp_coeff[2] * (FLAC__i
nt64)data[i-3]; |
| 409 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-4)); |
| 410 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 411 xmm6 = _mm_mul_epi32(xmm6, xmm1)
; |
| 412 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 413 |
| 414 //sum += qlp_coeff[1] * (FLAC__i
nt64)data[i-2]; |
| 415 //sum += qlp_coeff[0] * (FLAC__i
nt64)data[i-1]; |
| 416 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 417 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 418 xmm6 = _mm_mul_epi32(xmm6, xmm0)
; |
| 419 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 420 |
| 421 xmm7 = _mm_add_epi64(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 422 RESIDUAL64_RESULT(xmm7); |
| 423 } |
| 424 } |
| 425 else { /* order == 5 */ |
| 426 __m128i xmm0, xmm1, xmm2, xmm6, xmm7; |
| 427 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 428 xmm1 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+2)); |
| 429 xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]); |
| 430 |
| 431 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 432 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF
LE(3,1,2,0)); |
| 433 |
| 434 for(i = 0; i < (int)data_len; i++) { |
| 435 //sum = 0; |
| 436 //sum = qlp_coeff[4] * (FLAC__i
nt64)data[i-5]; |
| 437 xmm7 = _mm_cvtsi32_si128(data[i-
5]); |
| 438 xmm7 = _mm_mul_epi32(xmm7, xmm2)
; |
| 439 |
| 440 //sum += qlp_coeff[3] * (FLAC__i
nt64)data[i-4]; |
| 441 //sum += qlp_coeff[2] * (FLAC__i
nt64)data[i-3]; |
| 442 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-4)); |
| 443 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 444 xmm6 = _mm_mul_epi32(xmm6, xmm1)
; |
| 445 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 446 |
| 447 //sum += qlp_coeff[1] * (FLAC__i
nt64)data[i-2]; |
| 448 //sum += qlp_coeff[0] * (FLAC__i
nt64)data[i-1]; |
| 449 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 450 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 451 xmm6 = _mm_mul_epi32(xmm6, xmm0)
; |
| 452 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 453 |
| 454 xmm7 = _mm_add_epi64(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 455 RESIDUAL64_RESULT(xmm7); |
| 456 } |
| 457 } |
| 458 } |
| 459 } |
| 460 else { /* order == 1, 2, 3, 4 */ |
| 461 if(order > 2) { /* order == 3, 4 */ |
| 462 if(order == 4) { |
| 463 __m128i xmm0, xmm1, xmm6, xmm7; |
| 464 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 465 xmm1 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+2)); |
| 466 |
| 467 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 468 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF
LE(3,1,2,0)); |
| 469 |
| 470 for(i = 0; i < (int)data_len; i++) { |
| 471 //sum = 0; |
| 472 //sum += qlp_coeff[3] * (FLAC__i
nt64)data[i-4]; |
| 473 //sum += qlp_coeff[2] * (FLAC__i
nt64)data[i-3]; |
| 474 xmm7 = _mm_loadl_epi64((const __
m128i*)(data+i-4)); |
| 475 xmm7 = _mm_shuffle_epi32(xmm7, _
MM_SHUFFLE(2,0,3,1)); |
| 476 xmm7 = _mm_mul_epi32(xmm7, xmm1)
; |
| 477 |
| 478 //sum += qlp_coeff[1] * (FLAC__i
nt64)data[i-2]; |
| 479 //sum += qlp_coeff[0] * (FLAC__i
nt64)data[i-1]; |
| 480 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 481 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 482 xmm6 = _mm_mul_epi32(xmm6, xmm0)
; |
| 483 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 484 |
| 485 xmm7 = _mm_add_epi64(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 486 RESIDUAL64_RESULT(xmm7); |
| 487 } |
| 488 } |
| 489 else { /* order == 3 */ |
| 490 __m128i xmm0, xmm1, xmm6, xmm7; |
| 491 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 492 xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]); |
| 493 |
| 494 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 495 |
| 496 for(i = 0; i < (int)data_len; i++) { |
| 497 //sum = 0; |
| 498 //sum = qlp_coeff[2] * (FLAC__i
nt64)data[i-3]; |
| 499 xmm7 = _mm_cvtsi32_si128(data[i-
3]); |
| 500 xmm7 = _mm_mul_epi32(xmm7, xmm1)
; |
| 501 |
| 502 //sum += qlp_coeff[1] * (FLAC__i
nt64)data[i-2]; |
| 503 //sum += qlp_coeff[0] * (FLAC__i
nt64)data[i-1]; |
| 504 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 505 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 506 xmm6 = _mm_mul_epi32(xmm6, xmm0)
; |
| 507 xmm7 = _mm_add_epi64(xmm7, xmm6)
; |
| 508 |
| 509 xmm7 = _mm_add_epi64(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 510 RESIDUAL64_RESULT(xmm7); |
| 511 } |
| 512 } |
| 513 } |
| 514 else { /* order == 1, 2 */ |
| 515 if(order == 2) { |
| 516 __m128i xmm0, xmm7; |
| 517 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 518 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 519 |
| 520 for(i = 0; i < (int)data_len; i++) { |
| 521 //sum = 0; |
| 522 //sum += qlp_coeff[1] * (FLAC__i
nt64)data[i-2]; |
| 523 //sum += qlp_coeff[0] * (FLAC__i
nt64)data[i-1]; |
| 524 xmm7 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 525 xmm7 = _mm_shuffle_epi32(xmm7, _
MM_SHUFFLE(2,0,3,1)); |
| 526 xmm7 = _mm_mul_epi32(xmm7, xmm0)
; |
| 527 |
| 528 xmm7 = _mm_add_epi64(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 529 RESIDUAL64_RESULT(xmm7); |
| 530 } |
| 531 } |
| 532 else { /* order == 1 */ |
| 533 __m128i xmm0, xmm7; |
| 534 xmm0 = _mm_cvtsi32_si128(qlp_coeff[0]); |
| 535 |
| 536 for(i = 0; i < (int)data_len; i++) { |
| 537 //sum = qlp_coeff[0] * (FLAC__in
t64)data[i-1]; |
| 538 xmm7 = _mm_cvtsi32_si128(data[i-
1]); |
| 539 xmm7 = _mm_mul_epi32(xmm7, xmm0)
; |
| 540 RESIDUAL64_RESULT(xmm7); |
| 541 } |
| 542 } |
| 543 } |
| 544 } |
| 545 } |
| 546 else { /* order > 12 */ |
| 547 FLAC__int64 sum; |
| 548 for(i = 0; i < (int)data_len; i++) { |
| 549 sum = 0; |
| 550 switch(order) { |
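|     /* Every case below intentionally falls through, so only the taps qlp_coeff[order-1] .. qlp_coeff[0] |
|        contribute to the sum. */ |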
| 551 case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32]; |
| 552 case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31]; |
| 553 case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30]; |
| 554 case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29]; |
| 555 case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28]; |
| 556 case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27]; |
| 557 case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26]; |
| 558 case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25]; |
| 559 case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24]; |
| 560 case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23]; |
| 561 case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22]; |
| 562 case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21]; |
| 563 case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20]; |
| 564 case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19]; |
| 565 case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18]; |
| 566 case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17]; |
| 567 case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16]; |
| 568 case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15]; |
| 569 case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14]; |
| 570 case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13]; |
| 571          sum += qlp_coeff[11] * (FLAC__int64)data[i-12]; |
| 572          sum += qlp_coeff[10] * (FLAC__int64)data[i-11]; |
| 573          sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10]; |
| 574          sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9]; |
| 575          sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8]; |
| 576          sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7]; |
| 577          sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6]; |
| 578          sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5]; |
| 579          sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4]; |
| 580          sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3]; |
| 581          sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2]; |
| 582          sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1]; |
| 583 } |
| 584 residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization); |
| 585 } |
| 586 } |
| 587 } |
| 588 |
| 589 FLAC__SSE_TARGET("sse4.1") |
| 590 void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) |
| 591 { |
| 592 int i; |
| 593 __m128i cnt = _mm_cvtsi32_si128(lp_quantization); |
| 594 |
| 595 if (!data_len) |
| 596 return; |
| 597 |
| 598 FLAC__ASSERT(order > 0); |
| 599 FLAC__ASSERT(order <= 32); |
| 600 FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */ |
| 601 |
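|    /* Signal restoration is the inverse of the residual computation above: |
|       data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization). Since each restored sample feeds the |
|       next prediction, the SIMD paths below keep the most recent `order` samples in the dat[] registers |
|       and slide that window forward with _mm_alignr_epi8 instead of reloading from memory; data[0] is |
|       produced before the loop, which is why data_len == 0 is rejected up front. */ |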
| 602 if(order <= 12) { |
| 603 if(order > 8) { /* order == 9, 10, 11, 12 */ |
| 604 if(order > 10) { /* order == 11, 12 */ |
| 605 __m128i qlp[6], dat[6]; |
| 606 __m128i summ, temp; |
| 607 qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_co
eff+0)); // 0 0 q[1] q[0] |
| 608 qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_co
eff+2)); // 0 0 q[3] q[2] |
| 609 qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_co
eff+4)); // 0 0 q[5] q[4] |
| 610 qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_co
eff+6)); // 0 0 q[7] q[6] |
| 611 qlp[4] = _mm_loadl_epi64((const __m128i*)(qlp_co
eff+8)); // 0 0 q[9] q[8] |
| 612 if (order == 12) |
| 613 qlp[5] = _mm_loadl_epi64((const __m128i*
)(qlp_coeff+10)); // 0 0 q[11] q[10] |
| 614 else |
| 615 qlp[5] = _mm_cvtsi32_si128(qlp_coeff[10]
); // 0 0 0 q[10] |
| 616 |
| 617 qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1)); // 0 q[0]  0 q[1] |
| 618 qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1)); // 0 q[2]  0 q[3] |
| 619 qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1)); // 0 q[4]  0 q[5] |
| 620 qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2,0,3,1)); // 0 q[6]  0 q[7] |
| 621 qlp[4] = _mm_shuffle_epi32(qlp[4], _MM_SHUFFLE(2,0,3,1)); // 0 q[8]  0 q[9] |
| 622 qlp[5] = _mm_shuffle_epi32(qlp[5], _MM_SHUFFLE(2,0,3,1)); // 0 q[10] 0 q[11] |
| 623 |
| 624 dat[5] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-12))); // ? d[i-11] ? d[i-12] |
| 625 dat[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-10))); // ? d[i-9]  ? d[i-10] |
| 626 dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-8 ))); // ? d[i-7]  ? d[i-8] |
| 627 dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 ))); // ? d[i-5]  ? d[i-6] |
| 628 dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 ))); // ? d[i-3]  ? d[i-4] |
| 629 dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 ))); // ? d[i-1]  ? d[i-2] |
| 630 |
| 631 summ = _mm_mul_epi32(dat[5],
qlp[5]) ; |
| 632 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4],
qlp[4])); |
| 633 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3],
qlp[3])); |
| 634 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2],
qlp[2])); |
| 635 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1],
qlp[1])); |
| 636 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0],
qlp[0])); |
| 637 |
| 638 summ = _mm_add_epi64(summ, _mm_srli_si128(summ,
8)); // ?_64 sum_64 |
| 639 summ = _mm_srl_epi64(summ, cnt);
// ?_64 (sum >> lp_quantization)_64 == ?_32
?_32 ?_32 (sum >> lp_quantization)_32 |
| 640 temp = _mm_cvtsi32_si128(residual[0]);
// 0 0 0 r[i] |
| 641 temp = _mm_add_epi32(temp, summ);
// ? ? ? d[i] |
| 642 data[0] = _mm_cvtsi128_si32(temp); |
| 643 |
| 644 for(i = 1; i < (int)data_len; i++) { |
| 645 dat[5] = _mm_alignr_epi8(dat[4], dat[5],
8); // ? d[i-10] ? d[i-11] |
| 646 dat[4] = _mm_alignr_epi8(dat[3], dat[4],
8); // ? d[i-8] ? d[i-9] |
| 647 dat[3] = _mm_alignr_epi8(dat[2], dat[3],
8); // ? d[i-6] ? d[i-7] |
| 648 dat[2] = _mm_alignr_epi8(dat[1], dat[2],
8); // ? d[i-4] ? d[i-5] |
| 649 dat[1] = _mm_alignr_epi8(dat[0], dat[1],
8); // ? d[i-2] ? d[i-3] |
| 650 dat[0] = _mm_alignr_epi8(temp, dat[0],
8); // ? d[i ] ? d[i-1] |
| 651 |
| 652 summ = _mm_mul_epi32
(dat[5], qlp[5]) ; |
| 653 summ = _mm_add_epi64(summ, _mm_mul_epi32
(dat[4], qlp[4])); |
| 654 summ = _mm_add_epi64(summ, _mm_mul_epi32
(dat[3], qlp[3])); |
| 655 summ = _mm_add_epi64(summ, _mm_mul_epi32
(dat[2], qlp[2])); |
| 656 summ = _mm_add_epi64(summ, _mm_mul_epi32
(dat[1], qlp[1])); |
| 657 summ = _mm_add_epi64(summ, _mm_mul_epi32
(dat[0], qlp[0])); |
| 658 |
| 659 summ = _mm_add_epi64(summ, _mm_srli_si12
8(summ, 8)); // ?_64 sum_64 |
| 660 summ = _mm_srl_epi64(summ, cnt);
// ?_64 (sum >> lp_quantization)_64 ==
?_32 ?_32 ?_32 (sum >> lp_quantization)_32 |
| 661 temp = _mm_cvtsi32_si128(residual[i]);
// 0 0 0 r[i] |
| 662 temp = _mm_add_epi32(temp, summ);
// ? ? ? d[i] |
| 663 data[i] = _mm_cvtsi128_si32(temp); |
| 664 } |
| 665 } |
| 666 else { /* order == 9, 10 */ |
| 667 __m128i qlp[5], dat[5]; |
| 668 __m128i summ, temp; |
| 669 qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_co
eff+0)); |
| 670 qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_co
eff+2)); |
| 671 qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_co
eff+4)); |
| 672 qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_co
eff+6)); |
| 673 if (order == 10) |
| 674 qlp[4] = _mm_loadl_epi64((const __m128i*
)(qlp_coeff+8)); |
| 675 else |
| 676 qlp[4] = _mm_cvtsi32_si128(qlp_coeff[8])
; |
| 677 |
| 678 qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2
,0,3,1)); |
| 679 qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2
,0,3,1)); |
| 680 qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2
,0,3,1)); |
| 681 qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2
,0,3,1)); |
| 682 qlp[4] = _mm_shuffle_epi32(qlp[4], _MM_SHUFFLE(2
,0,3,1)); |
| 683 |
| 684 dat[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con
st __m128i*)(data-10))); |
| 685 dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con
st __m128i*)(data-8 ))); |
| 686 dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con
st __m128i*)(data-6 ))); |
| 687 dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con
st __m128i*)(data-4 ))); |
| 688 dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con
st __m128i*)(data-2 ))); |
| 689 |
| 690 summ = _mm_mul_epi32(dat[4],
qlp[4]) ; |
| 691 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3],
qlp[3])); |
| 692 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2],
qlp[2])); |
| 693 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1],
qlp[1])); |
| 694 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0],
qlp[0])); |
| 695 |
| 696 summ = _mm_add_epi64(summ, _mm_srli_si128(summ,
8)); |
| 697 summ = _mm_srl_epi64(summ, cnt); |
| 698 temp = _mm_cvtsi32_si128(residual[0]); |
| 699 temp = _mm_add_epi32(temp, summ); |
| 700 data[0] = _mm_cvtsi128_si32(temp); |
| 701 |
| 702 for(i = 1; i < (int)data_len; i++) { |
| 703 dat[4] = _mm_alignr_epi8(dat[3], dat[4],
8); |
| 704 dat[3] = _mm_alignr_epi8(dat[2], dat[3],
8); |
| 705 dat[2] = _mm_alignr_epi8(dat[1], dat[2],
8); |
| 706 dat[1] = _mm_alignr_epi8(dat[0], dat[1],
8); |
| 707 dat[0] = _mm_alignr_epi8(temp, dat[0],
8); |
| 708 |
| 709 summ = _mm_mul_epi32
(dat[4], qlp[4]) ; |
| 710 summ = _mm_add_epi64(summ, _mm_mul_epi32
(dat[3], qlp[3])); |
| 711 summ = _mm_add_epi64(summ, _mm_mul_epi32
(dat[2], qlp[2])); |
| 712 summ = _mm_add_epi64(summ, _mm_mul_epi32
(dat[1], qlp[1])); |
| 713 summ = _mm_add_epi64(summ, _mm_mul_epi32
(dat[0], qlp[0])); |
| 714 |
| 715 summ = _mm_add_epi64(summ, _mm_srli_si12
8(summ, 8)); |
| 716 summ = _mm_srl_epi64(summ, cnt); |
| 717 temp = _mm_cvtsi32_si128(residual[i]); |
| 718 temp = _mm_add_epi32(temp, summ); |
| 719 data[i] = _mm_cvtsi128_si32(temp); |
| 720 } |
| 721 } |
| 722 } |
| 723 else if(order > 4) { /* order == 5, 6, 7, 8 */ |
| 724 if(order > 6) { /* order == 7, 8 */ |
| 725 __m128i qlp[4], dat[4]; |
| 726 __m128i summ, temp; |
| 727 qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_co
eff+0)); |
| 728 qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_co
eff+2)); |
| 729 qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_co
eff+4)); |
| 730 if (order == 8) |
| 731 qlp[3] = _mm_loadl_epi64((const __m128i*
)(qlp_coeff+6)); |
| 732 else |
| 733 qlp[3] = _mm_cvtsi32_si128(qlp_coeff[6])
; |
| 734 |
| 735 qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2
,0,3,1)); |
| 736 qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2
,0,3,1)); |
| 737 qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2
,0,3,1)); |
| 738 qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2
,0,3,1)); |
| 739 |
| 740 dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con
st __m128i*)(data-8 ))); |
| 741 dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con
st __m128i*)(data-6 ))); |
| 742 dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con
st __m128i*)(data-4 ))); |
| 743 dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con
st __m128i*)(data-2 ))); |
| 744 |
| 745 summ = _mm_mul_epi32(dat[3],
qlp[3]) ; |
| 746 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2],
qlp[2])); |
| 747 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1],
qlp[1])); |
| 748 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0],
qlp[0])); |
| 749 |
| 750 summ = _mm_add_epi64(summ, _mm_srli_si128(summ,
8)); |
| 751 summ = _mm_srl_epi64(summ, cnt); |
| 752 temp = _mm_cvtsi32_si128(residual[0]); |
| 753 temp = _mm_add_epi32(temp, summ); |
| 754 data[0] = _mm_cvtsi128_si32(temp); |
| 755 |
| 756 for(i = 1; i < (int)data_len; i++) { |
| 757 dat[3] = _mm_alignr_epi8(dat[2], dat[3],
8); |
| 758 dat[2] = _mm_alignr_epi8(dat[1], dat[2],
8); |
| 759 dat[1] = _mm_alignr_epi8(dat[0], dat[1],
8); |
| 760 dat[0] = _mm_alignr_epi8(temp, dat[0],
8); |
| 761 |
| 762 summ = _mm_mul_epi32
(dat[3], qlp[3]) ; |
| 763 summ = _mm_add_epi64(summ, _mm_mul_epi32
(dat[2], qlp[2])); |
| 764 summ = _mm_add_epi64(summ, _mm_mul_epi32
(dat[1], qlp[1])); |
| 765 summ = _mm_add_epi64(summ, _mm_mul_epi32
(dat[0], qlp[0])); |
| 766 |
| 767 summ = _mm_add_epi64(summ, _mm_srli_si12
8(summ, 8)); |
| 768 summ = _mm_srl_epi64(summ, cnt); |
| 769 temp = _mm_cvtsi32_si128(residual[i]); |
| 770 temp = _mm_add_epi32(temp, summ); |
| 771 data[i] = _mm_cvtsi128_si32(temp); |
| 772 } |
| 773 } |
| 774 else { /* order == 5, 6 */ |
| 775 __m128i qlp[3], dat[3]; |
| 776 __m128i summ, temp; |
| 777 qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_co
eff+0)); |
| 778 qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_co
eff+2)); |
| 779 if (order == 6) |
| 780 qlp[2] = _mm_loadl_epi64((const __m128i*
)(qlp_coeff+4)); |
| 781 else |
| 782 qlp[2] = _mm_cvtsi32_si128(qlp_coeff[4])
; |
| 783 |
| 784 qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2
,0,3,1)); |
| 785 qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2
,0,3,1)); |
| 786 qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2
,0,3,1)); |
| 787 |
| 788 dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con
st __m128i*)(data-6 ))); |
| 789 dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con
st __m128i*)(data-4 ))); |
| 790 dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con
st __m128i*)(data-2 ))); |
| 791 |
| 792 summ = _mm_mul_epi32(dat[2],
qlp[2]) ; |
| 793 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1],
qlp[1])); |
| 794 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0],
qlp[0])); |
| 795 |
| 796 summ = _mm_add_epi64(summ, _mm_srli_si128(summ,
8)); |
| 797 summ = _mm_srl_epi64(summ, cnt); |
| 798 temp = _mm_cvtsi32_si128(residual[0]); |
| 799 temp = _mm_add_epi32(temp, summ); |
| 800 data[0] = _mm_cvtsi128_si32(temp); |
| 801 |
| 802 for(i = 1; i < (int)data_len; i++) { |
| 803 dat[2] = _mm_alignr_epi8(dat[1], dat[2],
8); |
| 804 dat[1] = _mm_alignr_epi8(dat[0], dat[1],
8); |
| 805 dat[0] = _mm_alignr_epi8(temp, dat[0],
8); |
| 806 |
| 807 summ = _mm_mul_epi32
(dat[2], qlp[2]) ; |
| 808 summ = _mm_add_epi64(summ, _mm_mul_epi32
(dat[1], qlp[1])); |
| 809 summ = _mm_add_epi64(summ, _mm_mul_epi32
(dat[0], qlp[0])); |
| 810 |
| 811 summ = _mm_add_epi64(summ, _mm_srli_si12
8(summ, 8)); |
| 812 summ = _mm_srl_epi64(summ, cnt); |
| 813 temp = _mm_cvtsi32_si128(residual[i]); |
| 814 temp = _mm_add_epi32(temp, summ); |
| 815 data[i] = _mm_cvtsi128_si32(temp); |
| 816 } |
| 817 } |
| 818 } |
| 819 else { /* order == 1, 2, 3, 4 */ |
| 820 if(order > 2) { /* order == 3, 4 */ |
| 821 __m128i qlp[2], dat[2]; |
| 822 __m128i summ, temp; |
| 823 qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_co
eff+0)); |
| 824 if (order == 4) |
| 825 qlp[1] = _mm_loadl_epi64((const __m128i*
)(qlp_coeff+2)); |
| 826 else |
| 827 qlp[1] = _mm_cvtsi32_si128(qlp_coeff[2])
; |
| 828 |
| 829 qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2
,0,3,1)); |
| 830 qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2
,0,3,1)); |
| 831 |
| 832 dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con
st __m128i*)(data-4 ))); |
| 833 dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con
st __m128i*)(data-2 ))); |
| 834 |
| 835 summ = _mm_mul_epi32(dat[1],
qlp[1]) ; |
| 836 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0],
qlp[0])); |
| 837 |
| 838 summ = _mm_add_epi64(summ, _mm_srli_si128(summ,
8)); |
| 839 summ = _mm_srl_epi64(summ, cnt); |
| 840 temp = _mm_cvtsi32_si128(residual[0]); |
| 841 temp = _mm_add_epi32(temp, summ); |
| 842 data[0] = _mm_cvtsi128_si32(temp); |
| 843 |
| 844 for(i = 1; i < (int)data_len; i++) { |
| 845 dat[1] = _mm_alignr_epi8(dat[0], dat[1],
8); |
| 846 dat[0] = _mm_alignr_epi8(temp, dat[0],
8); |
| 847 |
| 848 summ = _mm_mul_epi32
(dat[1], qlp[1]) ; |
| 849 summ = _mm_add_epi64(summ, _mm_mul_epi32
(dat[0], qlp[0])); |
| 850 |
| 851 summ = _mm_add_epi64(summ, _mm_srli_si12
8(summ, 8)); |
| 852 summ = _mm_srl_epi64(summ, cnt); |
| 853 temp = _mm_cvtsi32_si128(residual[i]); |
| 854 temp = _mm_add_epi32(temp, summ); |
| 855 data[i] = _mm_cvtsi128_si32(temp); |
| 856 } |
| 857 } |
| 858 else { /* order == 1, 2 */ |
| 859 if(order == 2) { |
| 860 __m128i qlp0, dat0; |
| 861 __m128i summ, temp; |
| 862 qlp0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff)); |
| 863 qlp0 = _mm_shuffle_epi32(qlp0, _MM_SHUFF
LE(2,0,3,1)); |
| 864 |
| 865 dat0 = _mm_cvtepu32_epi64(_mm_loadl_epi6
4((const __m128i*)(data-2 ))); |
| 866 |
| 867 summ = _mm_mul_epi32(dat0, qlp0) ; |
| 868 |
| 869 summ = _mm_add_epi64(summ, _mm_srli_si12
8(summ, 8)); |
| 870 summ = _mm_srl_epi64(summ, cnt); |
| 871 temp = _mm_cvtsi32_si128(residual[0]); |
| 872 temp = _mm_add_epi32(temp, summ); |
| 873 data[0] = _mm_cvtsi128_si32(temp); |
| 874 |
| 875 for(i = 1; i < (int)data_len; i++) { |
| 876 dat0 = _mm_alignr_epi8(temp, dat
0, 8); |
| 877 |
| 878 summ = _mm_mul_epi32(dat0, qlp0)
; |
| 879 |
| 880 summ = _mm_add_epi64(summ, _mm_s
rli_si128(summ, 8)); |
| 881 summ = _mm_srl_epi64(summ, cnt); |
| 882 temp = _mm_cvtsi32_si128(residua
l[i]); |
| 883 temp = _mm_add_epi32(temp, summ)
; |
| 884 data[i] = _mm_cvtsi128_si32(temp
); |
| 885 } |
| 886 } |
| 887 else { /* order == 1 */ |
| 888 __m128i qlp0; |
| 889 __m128i summ, temp; |
| 890 qlp0 = _mm_cvtsi32_si128(qlp_coeff[0]); |
| 891 temp = _mm_cvtsi32_si128(data[-1]); |
| 892 |
| 893 summ = _mm_mul_epi32(temp, qlp0); |
| 894 summ = _mm_srl_epi64(summ, cnt); |
| 895 temp = _mm_cvtsi32_si128(residual[0]); |
| 896 temp = _mm_add_epi32(temp, summ); |
| 897 data[0] = _mm_cvtsi128_si32(temp); |
| 898 |
| 899 for(i = 1; i < (int)data_len; i++) { |
| 900 summ = _mm_mul_epi32(temp, qlp0)
; |
| 901 summ = _mm_srl_epi64(summ, cnt); |
| 902 temp = _mm_cvtsi32_si128(residua
l[i]); |
| 903 temp = _mm_add_epi32(temp, summ)
; |
| 904 data[i] = _mm_cvtsi128_si32(temp
); |
| 905 } |
| 906 } |
| 907 } |
| 908 } |
| 909 } |
| 910 else { /* order > 12 */ |
| 911 FLAC__int64 sum; |
| 912 for(i = 0; i < (int)data_len; i++) { |
| 913 sum = 0; |
| 914 switch(order) { |
| 915 case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32]; |
| 916 case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31]; |
| 917 case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30]; |
| 918 case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29]; |
| 919 case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28]; |
| 920 case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27]; |
| 921 case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26]; |
| 922 case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25]; |
| 923 case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24]; |
| 924 case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23]; |
| 925 case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22]; |
| 926 case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21]; |
| 927 case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20]; |
| 928 case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19]; |
| 929 case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18]; |
| 930 case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17]; |
| 931 case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16]; |
| 932 case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15]; |
| 933 case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14]; |
| 934 case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13]; |
| 935          sum += qlp_coeff[11] * (FLAC__int64)data[i-12]; |
| 936          sum += qlp_coeff[10] * (FLAC__int64)data[i-11]; |
| 937          sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10]; |
| 938          sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9]; |
| 939          sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8]; |
| 940          sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7]; |
| 941          sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6]; |
| 942          sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5]; |
| 943          sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4]; |
| 944          sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3]; |
| 945          sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2]; |
| 946          sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1]; |
| 947 } |
| 948 data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization); |
| 949 } |
| 950 } |
| 951 } |
| 952 |
| 953 #endif /* defined FLAC__CPU_IA32 */ |
| 954 |
| 955 FLAC__SSE_TARGET("sse4.1") |
| 956 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) |
| 957 { |
| 958 int i; |
| 959 FLAC__int32 sum; |
| 960 __m128i cnt = _mm_cvtsi32_si128(lp_quantization); |
| 961 |
| 962 FLAC__ASSERT(order > 0); |
| 963 FLAC__ASSERT(order <= 32); |
| 964 |
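|    /* 32-bit variant: the prediction sum is accumulated with _mm_mullo_epi32/_mm_add_epi32 and shifted |
|       with the arithmetic _mm_sra_epi32, so this routine is presumably meant for the case where the sum |
|       is known to fit in 32 bits (the _wide_ routines above cover 64-bit sums). Each coefficient is |
|       broadcast to all four lanes and the main loops compute four residuals per iteration from unaligned |
|       loads of data[]. */ |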
| 965 if(order <= 12) { |
| 966 if(order > 8) { |
| 967 if(order > 10) { |
| 968 if(order == 12) { |
| 969 __m128i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9, q10, q11; |
| 970 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0
= _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
| 971 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1
= _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
| 972 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2
= _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
| 973 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3
= _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); |
| 974 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4
= _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); |
| 975 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5
= _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); |
| 976 q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6
= _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); |
| 977 q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7
= _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); |
| 978 q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8
= _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); |
| 979 q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9
= _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0)); |
| 980 q10 = _mm_cvtsi32_si128(qlp_coeff[10]);
q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0)); |
| 981 q11 = _mm_cvtsi32_si128(qlp_coeff[11]);
q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0)); |
| 982 |
| 983 for(i = 0; i < (int)data_len-3; i+=4) { |
| 984 __m128i summ, mull; |
| 985 summ = _mm_mullo_epi32(q11, _mm_
loadu_si128((const __m128i*)(data+i-12))); |
| 986 mull = _mm_mullo_epi32(q10, _mm_
loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull); |
| 987 mull = _mm_mullo_epi32(q9, _mm_l
oadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull); |
| 988 mull = _mm_mullo_epi32(q8, _mm_l
oadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull); |
| 989 mull = _mm_mullo_epi32(q7, _mm_l
oadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); |
| 990 mull = _mm_mullo_epi32(q6, _mm_l
oadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); |
| 991 mull = _mm_mullo_epi32(q5, _mm_l
oadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); |
| 992 mull = _mm_mullo_epi32(q4, _mm_l
oadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); |
| 993 mull = _mm_mullo_epi32(q3, _mm_l
oadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); |
| 994 mull = _mm_mullo_epi32(q2, _mm_l
oadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); |
| 995 mull = _mm_mullo_epi32(q1, _mm_l
oadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); |
| 996 mull = _mm_mullo_epi32(q0, _mm_l
oadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
| 997 summ = _mm_sra_epi32(summ, cnt); |
| 998 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
| 999 } |
| 1000 } |
| 1001 else { /* order == 11 */ |
| 1002 __m128i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9, q10; |
| 1003 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0
= _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
| 1004 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1
= _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
| 1005 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2
= _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
| 1006 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3
= _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); |
| 1007 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4
= _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); |
| 1008 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5
= _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); |
| 1009 q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6
= _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); |
| 1010 q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7
= _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); |
| 1011 q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8
= _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); |
| 1012 q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9
= _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0)); |
| 1013 q10 = _mm_cvtsi32_si128(qlp_coeff[10]);
q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0)); |
| 1014 |
| 1015 for(i = 0; i < (int)data_len-3; i+=4) { |
| 1016 __m128i summ, mull; |
| 1017 summ = _mm_mullo_epi32(q10, _mm_
loadu_si128((const __m128i*)(data+i-11))); |
| 1018 mull = _mm_mullo_epi32(q9, _mm_l
oadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull); |
| 1019 mull = _mm_mullo_epi32(q8, _mm_l
oadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull); |
| 1020 mull = _mm_mullo_epi32(q7, _mm_l
oadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); |
| 1021 mull = _mm_mullo_epi32(q6, _mm_l
oadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); |
| 1022 mull = _mm_mullo_epi32(q5, _mm_l
oadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); |
| 1023 mull = _mm_mullo_epi32(q4, _mm_l
oadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); |
| 1024 mull = _mm_mullo_epi32(q3, _mm_l
oadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); |
| 1025 mull = _mm_mullo_epi32(q2, _mm_l
oadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); |
| 1026 mull = _mm_mullo_epi32(q1, _mm_l
oadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); |
| 1027 mull = _mm_mullo_epi32(q0, _mm_l
oadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
| 1028 summ = _mm_sra_epi32(summ, cnt); |
| 1029 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
| 1030 } |
| 1031 } |
| 1032 } |
| 1033 else { |
| 1034 if(order == 10) { |
| 1035 __m128i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9; |
| 1036 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0
= _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
| 1037 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1
= _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
| 1038 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2
= _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
| 1039 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3
= _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); |
| 1040 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4
= _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); |
| 1041 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5
= _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); |
| 1042 q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6
= _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); |
| 1043 q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7
= _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); |
| 1044 q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8
= _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); |
| 1045 q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9
= _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0)); |
| 1046 |
| 1047 for(i = 0; i < (int)data_len-3; i+=4) { |
| 1048 __m128i summ, mull; |
| 1049 summ = _mm_mullo_epi32(q9, _mm_l
oadu_si128((const __m128i*)(data+i-10))); |
| 1050 mull = _mm_mullo_epi32(q8, _mm_l
oadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull); |
| 1051 mull = _mm_mullo_epi32(q7, _mm_l
oadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); |
| 1052 mull = _mm_mullo_epi32(q6, _mm_l
oadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); |
| 1053 mull = _mm_mullo_epi32(q5, _mm_l
oadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); |
| 1054 mull = _mm_mullo_epi32(q4, _mm_l
oadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); |
| 1055 mull = _mm_mullo_epi32(q3, _mm_l
oadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); |
| 1056 mull = _mm_mullo_epi32(q2, _mm_l
oadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); |
| 1057 mull = _mm_mullo_epi32(q1, _mm_l
oadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); |
| 1058 mull = _mm_mullo_epi32(q0, _mm_l
oadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
| 1059 summ = _mm_sra_epi32(summ, cnt); |
| 1060 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
| 1061 } |
| 1062 } |
| 1063 else { /* order == 9 */ |
| 1064 __m128i q0, q1, q2, q3, q4, q5, q6, q7,
q8; |
| 1065 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0
= _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
| 1066 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1
= _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
| 1067 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2
= _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
| 1068 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3
= _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); |
| 1069 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4
= _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); |
| 1070 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5
= _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); |
| 1071 q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6
= _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); |
| 1072 q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7
= _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); |
| 1073 q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8
= _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); |
| 1074 |
| 1075 for(i = 0; i < (int)data_len-3; i+=4) { |
| 1076 __m128i summ, mull; |
| 1077 summ = _mm_mullo_epi32(q8, _mm_l
oadu_si128((const __m128i*)(data+i-9))); |
| 1078 mull = _mm_mullo_epi32(q7, _mm_l
oadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); |
| 1079 mull = _mm_mullo_epi32(q6, _mm_l
oadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); |
| 1080 mull = _mm_mullo_epi32(q5, _mm_l
oadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); |
| 1081 mull = _mm_mullo_epi32(q4, _mm_l
oadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); |
| 1082 mull = _mm_mullo_epi32(q3, _mm_l
oadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); |
| 1083 mull = _mm_mullo_epi32(q2, _mm_l
oadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); |
| 1084 mull = _mm_mullo_epi32(q1, _mm_l
oadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); |
| 1085 mull = _mm_mullo_epi32(q0, _mm_l
oadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
| 1086 summ = _mm_sra_epi32(summ, cnt); |
| 1087 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
| 1088 } |
| 1089 } |
| 1090 } |
| 1091 } |
| 1092 else if(order > 4) { |
| 1093 if(order > 6) { |
| 1094 if(order == 8) { |
| 1095 __m128i q0, q1, q2, q3, q4, q5, q6, q7; |
| 1096 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0
= _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
| 1097 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1
= _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
| 1098 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2
= _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
| 1099 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3
= _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); |
| 1100 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4
= _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); |
| 1101 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5
= _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); |
| 1102 q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6
= _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); |
| 1103 q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7
= _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); |
| 1104 |
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m128i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m128i q0, q1, q2, q3, q4, q5;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m128i q0, q1, q2, q3, q4;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m128i q0, q1, q2, q3;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m128i q0, q1, q2;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m128i q0, q1;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m128i q0;
					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ;
						summ = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
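		/*
		 * Scalar tail: handle the up-to-three samples that the four-wide loops above
		 * leave over. The switch relies on intentional fall-through, starting at the
		 * actual order and dropping through to case 1 so every coefficient is accumulated.
		 */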
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12];
				case 11: sum += qlp_coeff[10] * data[i-11];
				case 10: sum += qlp_coeff[ 9] * data[i-10];
				case 9: sum += qlp_coeff[ 8] * data[i- 9];
				case 8: sum += qlp_coeff[ 7] * data[i- 8];
				case 7: sum += qlp_coeff[ 6] * data[i- 7];
				case 6: sum += qlp_coeff[ 5] * data[i- 6];
				case 5: sum += qlp_coeff[ 4] * data[i- 5];
				case 4: sum += qlp_coeff[ 3] * data[i- 4];
				case 3: sum += qlp_coeff[ 2] * data[i- 3];
				case 2: sum += qlp_coeff[ 1] * data[i- 2];
				case 1: sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
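		/*
		 * There is no SSE4.1 path for orders above 12; predict every sample with the
		 * generic scalar loop. The switch falls through from the actual order down to
		 * case 13 and then adds the remaining twelve terms unconditionally.
		 */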
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

#endif /* FLAC__SSE4_1_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */