| 1 /* libFLAC - Free Lossless Audio Codec library |
| 2 * Copyright (C) 2000-2009 Josh Coalson |
| 3 * Copyright (C) 2011-2014 Xiph.Org Foundation |
| 4 * |
| 5 * Redistribution and use in source and binary forms, with or without |
| 6 * modification, are permitted provided that the following conditions |
| 7 * are met: |
| 8 * |
| 9 * - Redistributions of source code must retain the above copyright |
| 10 * notice, this list of conditions and the following disclaimer. |
| 11 * |
| 12 * - Redistributions in binary form must reproduce the above copyright |
| 13 * notice, this list of conditions and the following disclaimer in the |
| 14 * documentation and/or other materials provided with the distribution. |
| 15 * |
| 16 * - Neither the name of the Xiph.org Foundation nor the names of its |
| 17 * contributors may be used to endorse or promote products derived from |
| 18 * this software without specific prior written permission. |
| 19 * |
| 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR |
| 24 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 25 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 26 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 27 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| 28 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| 29 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 31 */ |
| 32 |
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__AVX2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <immintrin.h> /* AVX2 */

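/*
 * AVX2 implementations of the FLAC residual computation.  For each sample the
 * routines below compute
 *     residual[i] = data[i] - ((qlp_coeff[0]*data[i-1] + ... + qlp_coeff[order-1]*data[i-order]) >> lp_quantization)
 * with the prediction fully unrolled for orders 1..12; higher orders fall back
 * to a scalar switch-based loop at the end of each function.
 */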
FLAC__SSE_TARGET("avx2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

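	/*
	 * The coefficients are broadcast with their upper 16 bits masked off so that
	 * _mm256_madd_epi16() can be used: in every 32-bit lane it computes
	 * (coeff & 0xffff) * (low 16 bits of the sample) plus 0 times the high half,
	 * which equals coeff * sample as long as both the coefficients and the
	 * samples fit in a signed 16-bit range -- the assumption behind this "_16"
	 * variant.
	 */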
| 58 if(order <= 12) { |
| 59 if(order > 8) { |
| 60 if(order > 10) { |
| 61 if(order == 12) { |
| 62 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9, q10, q11; |
| 63 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 64 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 65 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 66 q3 = _mm256_set1_epi32(0xffff & qlp_coe
ff[3 ]); |
| 67 q4 = _mm256_set1_epi32(0xffff & qlp_coe
ff[4 ]); |
| 68 q5 = _mm256_set1_epi32(0xffff & qlp_coe
ff[5 ]); |
| 69 q6 = _mm256_set1_epi32(0xffff & qlp_coe
ff[6 ]); |
| 70 q7 = _mm256_set1_epi32(0xffff & qlp_coe
ff[7 ]); |
| 71 q8 = _mm256_set1_epi32(0xffff & qlp_coe
ff[8 ]); |
| 72 q9 = _mm256_set1_epi32(0xffff & qlp_coe
ff[9 ]); |
| 73 q10 = _mm256_set1_epi32(0xffff & qlp_coe
ff[10]); |
| 74 q11 = _mm256_set1_epi32(0xffff & qlp_coe
ff[11]); |
| 75 |
| 76 for(i = 0; i < (int)data_len-7; i+=8) { |
| 77 __m256i summ, mull; |
| 78 summ = _mm256_madd_epi16(q11, _m
m256_loadu_si256((const __m256i*)(data+i-12))); |
| 79 mull = _mm256_madd_epi16(q10, _m
m256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mu
ll); |
| 80 mull = _mm256_madd_epi16(q9, _m
m256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mu
ll); |
| 81 mull = _mm256_madd_epi16(q8, _m
m256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 82 mull = _mm256_madd_epi16(q7, _m
m256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 83 mull = _mm256_madd_epi16(q6, _m
m256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 84 mull = _mm256_madd_epi16(q5, _m
m256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 85 mull = _mm256_madd_epi16(q4, _m
m256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 86 mull = _mm256_madd_epi16(q3, _m
m256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 87 mull = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 88 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 89 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 90 summ = _mm256_sra_epi32(summ, cn
t); |
| 91 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 92 } |
| 93 } |
| 94 else { /* order == 11 */ |
| 95 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9, q10; |
| 96 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 97 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 98 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 99 q3 = _mm256_set1_epi32(0xffff & qlp_coe
ff[3 ]); |
| 100 q4 = _mm256_set1_epi32(0xffff & qlp_coe
ff[4 ]); |
| 101 q5 = _mm256_set1_epi32(0xffff & qlp_coe
ff[5 ]); |
| 102 q6 = _mm256_set1_epi32(0xffff & qlp_coe
ff[6 ]); |
| 103 q7 = _mm256_set1_epi32(0xffff & qlp_coe
ff[7 ]); |
| 104 q8 = _mm256_set1_epi32(0xffff & qlp_coe
ff[8 ]); |
| 105 q9 = _mm256_set1_epi32(0xffff & qlp_coe
ff[9 ]); |
| 106 q10 = _mm256_set1_epi32(0xffff & qlp_coe
ff[10]); |
| 107 |
| 108 for(i = 0; i < (int)data_len-7; i+=8) { |
| 109 __m256i summ, mull; |
| 110 summ = _mm256_madd_epi16(q10, _m
m256_loadu_si256((const __m256i*)(data+i-11))); |
| 111 mull = _mm256_madd_epi16(q9, _m
m256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mu
ll); |
| 112 mull = _mm256_madd_epi16(q8, _m
m256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 113 mull = _mm256_madd_epi16(q7, _m
m256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 114 mull = _mm256_madd_epi16(q6, _m
m256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 115 mull = _mm256_madd_epi16(q5, _m
m256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 116 mull = _mm256_madd_epi16(q4, _m
m256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 117 mull = _mm256_madd_epi16(q3, _m
m256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 118 mull = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 119 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 120 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 121 summ = _mm256_sra_epi32(summ, cn
t); |
| 122 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 123 } |
| 124 } |
| 125 } |
| 126 else { |
| 127 if(order == 10) { |
| 128 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9; |
| 129 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 130 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 131 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 132 q3 = _mm256_set1_epi32(0xffff & qlp_coe
ff[3 ]); |
| 133 q4 = _mm256_set1_epi32(0xffff & qlp_coe
ff[4 ]); |
| 134 q5 = _mm256_set1_epi32(0xffff & qlp_coe
ff[5 ]); |
| 135 q6 = _mm256_set1_epi32(0xffff & qlp_coe
ff[6 ]); |
| 136 q7 = _mm256_set1_epi32(0xffff & qlp_coe
ff[7 ]); |
| 137 q8 = _mm256_set1_epi32(0xffff & qlp_coe
ff[8 ]); |
| 138 q9 = _mm256_set1_epi32(0xffff & qlp_coe
ff[9 ]); |
| 139 |
| 140 for(i = 0; i < (int)data_len-7; i+=8) { |
| 141 __m256i summ, mull; |
| 142 summ = _mm256_madd_epi16(q9, _m
m256_loadu_si256((const __m256i*)(data+i-10))); |
| 143 mull = _mm256_madd_epi16(q8, _m
m256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 144 mull = _mm256_madd_epi16(q7, _m
m256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 145 mull = _mm256_madd_epi16(q6, _m
m256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 146 mull = _mm256_madd_epi16(q5, _m
m256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 147 mull = _mm256_madd_epi16(q4, _m
m256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 148 mull = _mm256_madd_epi16(q3, _m
m256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 149 mull = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 150 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 151 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 152 summ = _mm256_sra_epi32(summ, cn
t); |
| 153 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 154 } |
| 155 } |
| 156 else { /* order == 9 */ |
| 157 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8; |
| 158 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 159 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 160 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 161 q3 = _mm256_set1_epi32(0xffff & qlp_coe
ff[3 ]); |
| 162 q4 = _mm256_set1_epi32(0xffff & qlp_coe
ff[4 ]); |
| 163 q5 = _mm256_set1_epi32(0xffff & qlp_coe
ff[5 ]); |
| 164 q6 = _mm256_set1_epi32(0xffff & qlp_coe
ff[6 ]); |
| 165 q7 = _mm256_set1_epi32(0xffff & qlp_coe
ff[7 ]); |
| 166 q8 = _mm256_set1_epi32(0xffff & qlp_coe
ff[8 ]); |
| 167 |
| 168 for(i = 0; i < (int)data_len-7; i+=8) { |
| 169 __m256i summ, mull; |
| 170 summ = _mm256_madd_epi16(q8, _m
m256_loadu_si256((const __m256i*)(data+i-9 ))); |
| 171 mull = _mm256_madd_epi16(q7, _m
m256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 172 mull = _mm256_madd_epi16(q6, _m
m256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 173 mull = _mm256_madd_epi16(q5, _m
m256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 174 mull = _mm256_madd_epi16(q4, _m
m256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 175 mull = _mm256_madd_epi16(q3, _m
m256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 176 mull = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 177 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 178 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 179 summ = _mm256_sra_epi32(summ, cn
t); |
| 180 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 181 } |
| 182 } |
| 183 } |
| 184 } |
| 185 else if(order > 4) { |
| 186 if(order > 6) { |
| 187 if(order == 8) { |
| 188 __m256i q0, q1, q2, q3, q4, q5, q6, q7; |
| 189 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 190 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 191 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 192 q3 = _mm256_set1_epi32(0xffff & qlp_coe
ff[3 ]); |
| 193 q4 = _mm256_set1_epi32(0xffff & qlp_coe
ff[4 ]); |
| 194 q5 = _mm256_set1_epi32(0xffff & qlp_coe
ff[5 ]); |
| 195 q6 = _mm256_set1_epi32(0xffff & qlp_coe
ff[6 ]); |
| 196 q7 = _mm256_set1_epi32(0xffff & qlp_coe
ff[7 ]); |
| 197 |
| 198 for(i = 0; i < (int)data_len-7; i+=8) { |
| 199 __m256i summ, mull; |
| 200 summ = _mm256_madd_epi16(q7, _m
m256_loadu_si256((const __m256i*)(data+i-8 ))); |
| 201 mull = _mm256_madd_epi16(q6, _m
m256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 202 mull = _mm256_madd_epi16(q5, _m
m256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 203 mull = _mm256_madd_epi16(q4, _m
m256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 204 mull = _mm256_madd_epi16(q3, _m
m256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 205 mull = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 206 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 207 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 208 summ = _mm256_sra_epi32(summ, cn
t); |
| 209 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 210 } |
| 211 } |
| 212 else { /* order == 7 */ |
| 213 __m256i q0, q1, q2, q3, q4, q5, q6; |
| 214 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 215 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 216 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 217 q3 = _mm256_set1_epi32(0xffff & qlp_coe
ff[3 ]); |
| 218 q4 = _mm256_set1_epi32(0xffff & qlp_coe
ff[4 ]); |
| 219 q5 = _mm256_set1_epi32(0xffff & qlp_coe
ff[5 ]); |
| 220 q6 = _mm256_set1_epi32(0xffff & qlp_coe
ff[6 ]); |
| 221 |
| 222 for(i = 0; i < (int)data_len-7; i+=8) { |
| 223 __m256i summ, mull; |
| 224 summ = _mm256_madd_epi16(q6, _m
m256_loadu_si256((const __m256i*)(data+i-7 ))); |
| 225 mull = _mm256_madd_epi16(q5, _m
m256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 226 mull = _mm256_madd_epi16(q4, _m
m256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 227 mull = _mm256_madd_epi16(q3, _m
m256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 228 mull = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 229 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 230 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 231 summ = _mm256_sra_epi32(summ, cn
t); |
| 232 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 233 } |
| 234 } |
| 235 } |
| 236 else { |
| 237 if(order == 6) { |
| 238 __m256i q0, q1, q2, q3, q4, q5; |
| 239 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 240 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 241 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 242 q3 = _mm256_set1_epi32(0xffff & qlp_coe
ff[3 ]); |
| 243 q4 = _mm256_set1_epi32(0xffff & qlp_coe
ff[4 ]); |
| 244 q5 = _mm256_set1_epi32(0xffff & qlp_coe
ff[5 ]); |
| 245 |
| 246 for(i = 0; i < (int)data_len-7; i+=8) { |
| 247 __m256i summ, mull; |
| 248 summ = _mm256_madd_epi16(q5, _m
m256_loadu_si256((const __m256i*)(data+i-6 ))); |
| 249 mull = _mm256_madd_epi16(q4, _m
m256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 250 mull = _mm256_madd_epi16(q3, _m
m256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 251 mull = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 252 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 253 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 254 summ = _mm256_sra_epi32(summ, cn
t); |
| 255 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 256 } |
| 257 } |
| 258 else { /* order == 5 */ |
| 259 __m256i q0, q1, q2, q3, q4; |
| 260 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 261 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 262 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 263 q3 = _mm256_set1_epi32(0xffff & qlp_coe
ff[3 ]); |
| 264 q4 = _mm256_set1_epi32(0xffff & qlp_coe
ff[4 ]); |
| 265 |
| 266 for(i = 0; i < (int)data_len-7; i+=8) { |
| 267 __m256i summ, mull; |
| 268 summ = _mm256_madd_epi16(q4, _m
m256_loadu_si256((const __m256i*)(data+i-5 ))); |
| 269 mull = _mm256_madd_epi16(q3, _m
m256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 270 mull = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 271 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 272 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 273 summ = _mm256_sra_epi32(summ, cn
t); |
| 274 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 275 } |
| 276 } |
| 277 } |
| 278 } |
| 279 else { |
| 280 if(order > 2) { |
| 281 if(order == 4) { |
| 282 __m256i q0, q1, q2, q3; |
| 283 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 284 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 285 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 286 q3 = _mm256_set1_epi32(0xffff & qlp_coe
ff[3 ]); |
| 287 |
| 288 for(i = 0; i < (int)data_len-7; i+=8) { |
| 289 __m256i summ, mull; |
| 290 summ = _mm256_madd_epi16(q3, _m
m256_loadu_si256((const __m256i*)(data+i-4 ))); |
| 291 mull = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 292 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 293 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 294 summ = _mm256_sra_epi32(summ, cn
t); |
| 295 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 296 } |
| 297 } |
| 298 else { /* order == 3 */ |
| 299 __m256i q0, q1, q2; |
| 300 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 301 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 302 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 303 |
| 304 for(i = 0; i < (int)data_len-7; i+=8) { |
| 305 __m256i summ, mull; |
| 306 summ = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); |
| 307 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 308 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 309 summ = _mm256_sra_epi32(summ, cn
t); |
| 310 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 311 } |
| 312 } |
| 313 } |
| 314 else { |
| 315 if(order == 2) { |
| 316 __m256i q0, q1; |
| 317 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 318 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 319 |
| 320 for(i = 0; i < (int)data_len-7; i+=8) { |
| 321 __m256i summ, mull; |
| 322 summ = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); |
| 323 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 324 summ = _mm256_sra_epi32(summ, cn
t); |
| 325 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 326 } |
| 327 } |
| 328 else { /* order == 1 */ |
| 329 __m256i q0; |
| 330 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 331 |
| 332 for(i = 0; i < (int)data_len-7; i+=8) { |
| 333 __m256i summ; |
| 334 summ = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); |
| 335 summ = _mm256_sra_epi32(summ, cn
t); |
| 336 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 337 } |
| 338 } |
| 339 } |
| 340 } |
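	/*
	 * The vectorized loop above handles data_len rounded down to a multiple of 8;
	 * the scalar loop below finishes the remaining samples, using a fall-through
	 * switch so that exactly `order` taps are accumulated per sample.
	 */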
| 341 for(; i < (int)data_len; i++) { |
| 342 sum = 0; |
| 343 switch(order) { |
| 344 case 12: sum += qlp_coeff[11] * data[i-12]; |
| 345 case 11: sum += qlp_coeff[10] * data[i-11]; |
| 346 case 10: sum += qlp_coeff[ 9] * data[i-10]; |
| 347 case 9: sum += qlp_coeff[ 8] * data[i- 9]; |
| 348 case 8: sum += qlp_coeff[ 7] * data[i- 8]; |
| 349 case 7: sum += qlp_coeff[ 6] * data[i- 7]; |
| 350 case 6: sum += qlp_coeff[ 5] * data[i- 6]; |
| 351 case 5: sum += qlp_coeff[ 4] * data[i- 5]; |
| 352 case 4: sum += qlp_coeff[ 3] * data[i- 4]; |
| 353 case 3: sum += qlp_coeff[ 2] * data[i- 3]; |
| 354 case 2: sum += qlp_coeff[ 1] * data[i- 2]; |
| 355 case 1: sum += qlp_coeff[ 0] * data[i- 1]; |
| 356 } |
| 357 residual[i] = data[i] - (sum >> lp_quantization); |
| 358 } |
| 359 } |
| 360 else { /* order > 12 */ |
| 361 for(i = 0; i < (int)data_len; i++) { |
| 362 sum = 0; |
| 363 switch(order) { |
| 364 case 32: sum += qlp_coeff[31] * data[i-32]; |
| 365 case 31: sum += qlp_coeff[30] * data[i-31]; |
| 366 case 30: sum += qlp_coeff[29] * data[i-30]; |
| 367 case 29: sum += qlp_coeff[28] * data[i-29]; |
| 368 case 28: sum += qlp_coeff[27] * data[i-28]; |
| 369 case 27: sum += qlp_coeff[26] * data[i-27]; |
| 370 case 26: sum += qlp_coeff[25] * data[i-26]; |
| 371 case 25: sum += qlp_coeff[24] * data[i-25]; |
| 372 case 24: sum += qlp_coeff[23] * data[i-24]; |
| 373 case 23: sum += qlp_coeff[22] * data[i-23]; |
| 374 case 22: sum += qlp_coeff[21] * data[i-22]; |
| 375 case 21: sum += qlp_coeff[20] * data[i-21]; |
| 376 case 20: sum += qlp_coeff[19] * data[i-20]; |
| 377 case 19: sum += qlp_coeff[18] * data[i-19]; |
| 378 case 18: sum += qlp_coeff[17] * data[i-18]; |
| 379 case 17: sum += qlp_coeff[16] * data[i-17]; |
| 380 case 16: sum += qlp_coeff[15] * data[i-16]; |
| 381 case 15: sum += qlp_coeff[14] * data[i-15]; |
| 382 case 14: sum += qlp_coeff[13] * data[i-14]; |
| 383 case 13: sum += qlp_coeff[12] * data[i-13]; |
| 384 sum += qlp_coeff[11] * data[i-12]; |
| 385 sum += qlp_coeff[10] * data[i-11]; |
| 386 sum += qlp_coeff[ 9] * data[i-10]; |
| 387 sum += qlp_coeff[ 8] * data[i- 9]; |
| 388 sum += qlp_coeff[ 7] * data[i- 8]; |
| 389 sum += qlp_coeff[ 6] * data[i- 7]; |
| 390 sum += qlp_coeff[ 5] * data[i- 6]; |
| 391 sum += qlp_coeff[ 4] * data[i- 5]; |
| 392 sum += qlp_coeff[ 3] * data[i- 4]; |
| 393 sum += qlp_coeff[ 2] * data[i- 3]; |
| 394 sum += qlp_coeff[ 1] * data[i- 2]; |
| 395 sum += qlp_coeff[ 0] * data[i- 1]; |
| 396 } |
| 397 residual[i] = data[i] - (sum >> lp_quantization); |
| 398 } |
| 399 } |
| 400 _mm256_zeroupper(); |
| 401 } |
| 402 |
FLAC__SSE_TARGET("avx2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

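	/*
	 * Full 32-bit variant: the coefficients are broadcast unmasked and
	 * _mm256_mullo_epi32() keeps only the low 32 bits of each 32x32 product,
	 * which matches the scalar fallback below that accumulates into a 32-bit sum.
	 */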
| 413 if(order <= 12) { |
| 414 if(order > 8) { |
| 415 if(order > 10) { |
| 416 if(order == 12) { |
| 417 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9, q10, q11; |
| 418 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 419 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 420 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 421 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
| 422 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
| 423 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
| 424 q6 = _mm256_set1_epi32(qlp_coeff[6 ]); |
| 425 q7 = _mm256_set1_epi32(qlp_coeff[7 ]); |
| 426 q8 = _mm256_set1_epi32(qlp_coeff[8 ]); |
| 427 q9 = _mm256_set1_epi32(qlp_coeff[9 ]); |
| 428 q10 = _mm256_set1_epi32(qlp_coeff[10]); |
| 429 q11 = _mm256_set1_epi32(qlp_coeff[11]); |
| 430 |
| 431 for(i = 0; i < (int)data_len-7; i+=8) { |
| 432 __m256i summ, mull; |
| 433 summ = _mm256_mullo_epi32(q11, _
mm256_loadu_si256((const __m256i*)(data+i-12))); |
| 434 mull = _mm256_mullo_epi32(q10, _
mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, m
ull); |
| 435 mull = _mm256_mullo_epi32(q9, _
mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, m
ull); |
| 436 mull = _mm256_mullo_epi32(q8, _
mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, m
ull); |
| 437 mull = _mm256_mullo_epi32(q7, _
mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, m
ull); |
| 438 mull = _mm256_mullo_epi32(q6, _
mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, m
ull); |
| 439 mull = _mm256_mullo_epi32(q5, _
mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, m
ull); |
| 440 mull = _mm256_mullo_epi32(q4, _
mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, m
ull); |
| 441 mull = _mm256_mullo_epi32(q3, _
mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, m
ull); |
| 442 mull = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, m
ull); |
| 443 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 444 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 445 summ = _mm256_sra_epi32(summ, cn
t); |
| 446 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 447 } |
| 448 } |
| 449 else { /* order == 11 */ |
| 450 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9, q10; |
| 451 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 452 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 453 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 454 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
| 455 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
| 456 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
| 457 q6 = _mm256_set1_epi32(qlp_coeff[6 ]); |
| 458 q7 = _mm256_set1_epi32(qlp_coeff[7 ]); |
| 459 q8 = _mm256_set1_epi32(qlp_coeff[8 ]); |
| 460 q9 = _mm256_set1_epi32(qlp_coeff[9 ]); |
| 461 q10 = _mm256_set1_epi32(qlp_coeff[10]); |
| 462 |
| 463 for(i = 0; i < (int)data_len-7; i+=8) { |
| 464 __m256i summ, mull; |
| 465 summ = _mm256_mullo_epi32(q10, _
mm256_loadu_si256((const __m256i*)(data+i-11))); |
| 466 mull = _mm256_mullo_epi32(q9, _
mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, m
ull); |
| 467 mull = _mm256_mullo_epi32(q8, _
mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, m
ull); |
| 468 mull = _mm256_mullo_epi32(q7, _
mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, m
ull); |
| 469 mull = _mm256_mullo_epi32(q6, _
mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, m
ull); |
| 470 mull = _mm256_mullo_epi32(q5, _
mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, m
ull); |
| 471 mull = _mm256_mullo_epi32(q4, _
mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, m
ull); |
| 472 mull = _mm256_mullo_epi32(q3, _
mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, m
ull); |
| 473 mull = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, m
ull); |
| 474 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 475 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 476 summ = _mm256_sra_epi32(summ, cn
t); |
| 477 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 478 } |
| 479 } |
| 480 } |
| 481 else { |
| 482 if(order == 10) { |
| 483 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9; |
| 484 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 485 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 486 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 487 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
| 488 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
| 489 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
| 490 q6 = _mm256_set1_epi32(qlp_coeff[6 ]); |
| 491 q7 = _mm256_set1_epi32(qlp_coeff[7 ]); |
| 492 q8 = _mm256_set1_epi32(qlp_coeff[8 ]); |
| 493 q9 = _mm256_set1_epi32(qlp_coeff[9 ]); |
| 494 |
| 495 for(i = 0; i < (int)data_len-7; i+=8) { |
| 496 __m256i summ, mull; |
| 497 summ = _mm256_mullo_epi32(q9, _
mm256_loadu_si256((const __m256i*)(data+i-10))); |
| 498 mull = _mm256_mullo_epi32(q8, _
mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, m
ull); |
| 499 mull = _mm256_mullo_epi32(q7, _
mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, m
ull); |
| 500 mull = _mm256_mullo_epi32(q6, _
mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, m
ull); |
| 501 mull = _mm256_mullo_epi32(q5, _
mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, m
ull); |
| 502 mull = _mm256_mullo_epi32(q4, _
mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, m
ull); |
| 503 mull = _mm256_mullo_epi32(q3, _
mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, m
ull); |
| 504 mull = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, m
ull); |
| 505 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 506 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 507 summ = _mm256_sra_epi32(summ, cn
t); |
| 508 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 509 } |
| 510 } |
| 511 else { /* order == 9 */ |
| 512 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8; |
| 513 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 514 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 515 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 516 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
| 517 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
| 518 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
| 519 q6 = _mm256_set1_epi32(qlp_coeff[6 ]); |
| 520 q7 = _mm256_set1_epi32(qlp_coeff[7 ]); |
| 521 q8 = _mm256_set1_epi32(qlp_coeff[8 ]); |
| 522 |
| 523 for(i = 0; i < (int)data_len-7; i+=8) { |
| 524 __m256i summ, mull; |
| 525 summ = _mm256_mullo_epi32(q8, _
mm256_loadu_si256((const __m256i*)(data+i-9))); |
| 526 mull = _mm256_mullo_epi32(q7, _
mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, m
ull); |
| 527 mull = _mm256_mullo_epi32(q6, _
mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, m
ull); |
| 528 mull = _mm256_mullo_epi32(q5, _
mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, m
ull); |
| 529 mull = _mm256_mullo_epi32(q4, _
mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, m
ull); |
| 530 mull = _mm256_mullo_epi32(q3, _
mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, m
ull); |
| 531 mull = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, m
ull); |
| 532 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 533 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 534 summ = _mm256_sra_epi32(summ, cn
t); |
| 535 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 536 } |
| 537 } |
| 538 } |
| 539 } |
| 540 else if(order > 4) { |
| 541 if(order > 6) { |
| 542 if(order == 8) { |
| 543 __m256i q0, q1, q2, q3, q4, q5, q6, q7; |
| 544 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 545 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 546 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 547 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
| 548 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
| 549 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
| 550 q6 = _mm256_set1_epi32(qlp_coeff[6 ]); |
| 551 q7 = _mm256_set1_epi32(qlp_coeff[7 ]); |
| 552 |
| 553 for(i = 0; i < (int)data_len-7; i+=8) { |
| 554 __m256i summ, mull; |
| 555 summ = _mm256_mullo_epi32(q7, _
mm256_loadu_si256((const __m256i*)(data+i-8))); |
| 556 mull = _mm256_mullo_epi32(q6, _
mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, m
ull); |
| 557 mull = _mm256_mullo_epi32(q5, _
mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, m
ull); |
| 558 mull = _mm256_mullo_epi32(q4, _
mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, m
ull); |
| 559 mull = _mm256_mullo_epi32(q3, _
mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, m
ull); |
| 560 mull = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, m
ull); |
| 561 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 562 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 563 summ = _mm256_sra_epi32(summ, cn
t); |
| 564 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 565 } |
| 566 } |
| 567 else { /* order == 7 */ |
| 568 __m256i q0, q1, q2, q3, q4, q5, q6; |
| 569 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 570 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 571 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 572 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
| 573 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
| 574 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
| 575 q6 = _mm256_set1_epi32(qlp_coeff[6 ]); |
| 576 |
| 577 for(i = 0; i < (int)data_len-7; i+=8) { |
| 578 __m256i summ, mull; |
| 579 summ = _mm256_mullo_epi32(q6, _
mm256_loadu_si256((const __m256i*)(data+i-7))); |
| 580 mull = _mm256_mullo_epi32(q5, _
mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, m
ull); |
| 581 mull = _mm256_mullo_epi32(q4, _
mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, m
ull); |
| 582 mull = _mm256_mullo_epi32(q3, _
mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, m
ull); |
| 583 mull = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, m
ull); |
| 584 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 585 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 586 summ = _mm256_sra_epi32(summ, cn
t); |
| 587 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 588 } |
| 589 } |
| 590 } |
| 591 else { |
| 592 if(order == 6) { |
| 593 __m256i q0, q1, q2, q3, q4, q5; |
| 594 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 595 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 596 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 597 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
| 598 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
| 599 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
| 600 |
| 601 for(i = 0; i < (int)data_len-7; i+=8) { |
| 602 __m256i summ, mull; |
| 603 summ = _mm256_mullo_epi32(q5, _
mm256_loadu_si256((const __m256i*)(data+i-6))); |
| 604 mull = _mm256_mullo_epi32(q4, _
mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, m
ull); |
| 605 mull = _mm256_mullo_epi32(q3, _
mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, m
ull); |
| 606 mull = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, m
ull); |
| 607 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 608 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 609 summ = _mm256_sra_epi32(summ, cn
t); |
| 610 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 611 } |
| 612 } |
| 613 else { /* order == 5 */ |
| 614 __m256i q0, q1, q2, q3, q4; |
| 615 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 616 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 617 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 618 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
| 619 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
| 620 |
| 621 for(i = 0; i < (int)data_len-7; i+=8) { |
| 622 __m256i summ, mull; |
| 623 summ = _mm256_mullo_epi32(q4, _
mm256_loadu_si256((const __m256i*)(data+i-5))); |
| 624 mull = _mm256_mullo_epi32(q3, _
mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, m
ull); |
| 625 mull = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, m
ull); |
| 626 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 627 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 628 summ = _mm256_sra_epi32(summ, cn
t); |
| 629 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 630 } |
| 631 } |
| 632 } |
| 633 } |
| 634 else { |
| 635 if(order > 2) { |
| 636 if(order == 4) { |
| 637 __m256i q0, q1, q2, q3; |
| 638 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 639 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 640 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 641 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
| 642 |
| 643 for(i = 0; i < (int)data_len-7; i+=8) { |
| 644 __m256i summ, mull; |
| 645 summ = _mm256_mullo_epi32(q3, _
mm256_loadu_si256((const __m256i*)(data+i-4))); |
| 646 mull = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, m
ull); |
| 647 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 648 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 649 summ = _mm256_sra_epi32(summ, cn
t); |
| 650 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 651 } |
| 652 } |
| 653 else { /* order == 3 */ |
| 654 __m256i q0, q1, q2; |
| 655 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 656 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 657 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 658 |
| 659 for(i = 0; i < (int)data_len-7; i+=8) { |
| 660 __m256i summ, mull; |
| 661 summ = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); |
| 662 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 663 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 664 summ = _mm256_sra_epi32(summ, cn
t); |
| 665 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 666 } |
| 667 } |
| 668 } |
| 669 else { |
| 670 if(order == 2) { |
| 671 __m256i q0, q1; |
| 672 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 673 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 674 |
| 675 for(i = 0; i < (int)data_len-7; i+=8) { |
| 676 __m256i summ, mull; |
| 677 summ = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); |
| 678 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 679 summ = _mm256_sra_epi32(summ, cn
t); |
| 680 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 681 } |
| 682 } |
| 683 else { /* order == 1 */ |
| 684 __m256i q0; |
| 685 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 686 |
| 687 for(i = 0; i < (int)data_len-7; i+=8) { |
| 688 __m256i summ; |
| 689 summ = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); |
| 690 summ = _mm256_sra_epi32(summ, cn
t); |
| 691 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 692 } |
| 693 } |
| 694 } |
| 695 } |
| 696 for(; i < (int)data_len; i++) { |
| 697 sum = 0; |
| 698 switch(order) { |
| 699 case 12: sum += qlp_coeff[11] * data[i-12]; |
| 700 case 11: sum += qlp_coeff[10] * data[i-11]; |
| 701 case 10: sum += qlp_coeff[ 9] * data[i-10]; |
| 702 case 9: sum += qlp_coeff[ 8] * data[i- 9]; |
| 703 case 8: sum += qlp_coeff[ 7] * data[i- 8]; |
| 704 case 7: sum += qlp_coeff[ 6] * data[i- 7]; |
| 705 case 6: sum += qlp_coeff[ 5] * data[i- 6]; |
| 706 case 5: sum += qlp_coeff[ 4] * data[i- 5]; |
| 707 case 4: sum += qlp_coeff[ 3] * data[i- 4]; |
| 708 case 3: sum += qlp_coeff[ 2] * data[i- 3]; |
| 709 case 2: sum += qlp_coeff[ 1] * data[i- 2]; |
| 710 case 1: sum += qlp_coeff[ 0] * data[i- 1]; |
| 711 } |
| 712 residual[i] = data[i] - (sum >> lp_quantization); |
| 713 } |
| 714 } |
| 715 else { /* order > 12 */ |
| 716 for(i = 0; i < (int)data_len; i++) { |
| 717 sum = 0; |
| 718 switch(order) { |
| 719 case 32: sum += qlp_coeff[31] * data[i-32]; |
| 720 case 31: sum += qlp_coeff[30] * data[i-31]; |
| 721 case 30: sum += qlp_coeff[29] * data[i-30]; |
| 722 case 29: sum += qlp_coeff[28] * data[i-29]; |
| 723 case 28: sum += qlp_coeff[27] * data[i-28]; |
| 724 case 27: sum += qlp_coeff[26] * data[i-27]; |
| 725 case 26: sum += qlp_coeff[25] * data[i-26]; |
| 726 case 25: sum += qlp_coeff[24] * data[i-25]; |
| 727 case 24: sum += qlp_coeff[23] * data[i-24]; |
| 728 case 23: sum += qlp_coeff[22] * data[i-23]; |
| 729 case 22: sum += qlp_coeff[21] * data[i-22]; |
| 730 case 21: sum += qlp_coeff[20] * data[i-21]; |
| 731 case 20: sum += qlp_coeff[19] * data[i-20]; |
| 732 case 19: sum += qlp_coeff[18] * data[i-19]; |
| 733 case 18: sum += qlp_coeff[17] * data[i-18]; |
| 734 case 17: sum += qlp_coeff[16] * data[i-17]; |
| 735 case 16: sum += qlp_coeff[15] * data[i-16]; |
| 736 case 15: sum += qlp_coeff[14] * data[i-15]; |
| 737 case 14: sum += qlp_coeff[13] * data[i-14]; |
| 738 case 13: sum += qlp_coeff[12] * data[i-13]; |
| 739 sum += qlp_coeff[11] * data[i-12]; |
| 740 sum += qlp_coeff[10] * data[i-11]; |
| 741 sum += qlp_coeff[ 9] * data[i-10]; |
| 742 sum += qlp_coeff[ 8] * data[i- 9]; |
| 743 sum += qlp_coeff[ 7] * data[i- 8]; |
| 744 sum += qlp_coeff[ 6] * data[i- 7]; |
| 745 sum += qlp_coeff[ 5] * data[i- 6]; |
| 746 sum += qlp_coeff[ 4] * data[i- 5]; |
| 747 sum += qlp_coeff[ 3] * data[i- 4]; |
| 748 sum += qlp_coeff[ 2] * data[i- 3]; |
| 749 sum += qlp_coeff[ 1] * data[i- 2]; |
| 750 sum += qlp_coeff[ 0] * data[i- 1]; |
| 751 } |
| 752 residual[i] = data[i] - (sum >> lp_quantization); |
| 753 } |
| 754 } |
| 755 _mm256_zeroupper(); |
| 756 } |
| 757 |
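/*
 * Index vector for _mm256_permutevar8x32_epi32(): it gathers the low 32-bit
 * half of each of the four 64-bit sums into the lower 128-bit lane so the
 * result can be written back with a single 128-bit store.
 */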
static FLAC__int32 pack_arr[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
| 759 |
FLAC__SSE_TARGET("avx2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int64 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
	__m256i pack = _mm256_loadu_si256((const __m256i *)pack_arr);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm256_sra_epi64() so we have to use _mm256_srl_epi64() */

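	/*
	 * Wide (64-bit accumulator) variant: coefficients and samples are widened
	 * with _mm256_cvtepu32_epi64() and multiplied with _mm256_mul_epi32(), which
	 * reads only the low 32 bits of each 64-bit lane (sign-extended), so the
	 * zero-extension is harmless.  AVX2 has no 64-bit arithmetic right shift,
	 * hence the logical _mm256_srl_epi64(); this is presumably safe here because
	 * only the low 32 bits of each shifted sum are kept and lp_quantization <= 32,
	 * so those bits are the same for a logical and an arithmetic shift.
	 */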
| 772 if(order <= 12) { |
| 773 if(order > 8) { |
| 774 if(order > 10) { |
| 775 if(order == 12) { |
| 776 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9, q10, q11; |
| 777 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[0 ])); |
| 778 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[1 ])); |
| 779 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[2 ])); |
| 780 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[3 ])); |
| 781 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[4 ])); |
| 782 q5 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[5 ])); |
| 783 q6 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[6 ])); |
| 784 q7 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[7 ])); |
| 785 q8 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[8 ])); |
| 786 q9 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[9 ])); |
| 787 q10 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[10])); |
| 788 q11 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[11])); |
| 789 |
| 790 for(i = 0; i < (int)data_len-3; i+=4) { |
| 791 __m256i summ, mull; |
| 792 summ = _mm256_mul_epi32(q11, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-12)))); |
| 793 mull = _mm256_mul_epi32(q10, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); summ = _mm256
_add_epi64(summ, mull); |
| 794 mull = _mm256_mul_epi32(q9, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256
_add_epi64(summ, mull); |
| 795 mull = _mm256_mul_epi32(q8, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256
_add_epi64(summ, mull); |
| 796 mull = _mm256_mul_epi32(q7, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256
_add_epi64(summ, mull); |
| 797 mull = _mm256_mul_epi32(q6, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256
_add_epi64(summ, mull); |
| 798 mull = _mm256_mul_epi32(q5, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256
_add_epi64(summ, mull); |
| 799 mull = _mm256_mul_epi32(q4, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256
_add_epi64(summ, mull); |
| 800 mull = _mm256_mul_epi32(q3, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256
_add_epi64(summ, mull); |
| 801 mull = _mm256_mul_epi32(q2, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256
_add_epi64(summ, mull); |
| 802 mull = _mm256_mul_epi32(q1, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256
_add_epi64(summ, mull); |
| 803 mull = _mm256_mul_epi32(q0, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256
_add_epi64(summ, mull); |
| 804 summ = _mm256_permutevar8x32_epi
32(_mm256_srl_epi64(summ, cnt), pack); |
| 805 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi2
56_si128(summ))); |
| 806 } |
| 807 } |
| 808 else { /* order == 11 */ |
| 809 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9, q10; |
| 810 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[0 ])); |
| 811 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[1 ])); |
| 812 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[2 ])); |
| 813 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[3 ])); |
| 814 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[4 ])); |
| 815 q5 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[5 ])); |
| 816 q6 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[6 ])); |
| 817 q7 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[7 ])); |
| 818 q8 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[8 ])); |
| 819 q9 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[9 ])); |
| 820 q10 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[10])); |
| 821 |
| 822 for(i = 0; i < (int)data_len-3; i+=4) { |
| 823 __m256i summ, mull; |
| 824 summ = _mm256_mul_epi32(q10, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); |
| 825 mull = _mm256_mul_epi32(q9, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256
_add_epi64(summ, mull); |
| 826 mull = _mm256_mul_epi32(q8, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256
_add_epi64(summ, mull); |
| 827 mull = _mm256_mul_epi32(q7, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256
_add_epi64(summ, mull); |
| 828 mull = _mm256_mul_epi32(q6, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256
_add_epi64(summ, mull); |
| 829 mull = _mm256_mul_epi32(q5, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256
_add_epi64(summ, mull); |
| 830 mull = _mm256_mul_epi32(q4, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256
_add_epi64(summ, mull); |
| 831 mull = _mm256_mul_epi32(q3, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256
_add_epi64(summ, mull); |
| 832 mull = _mm256_mul_epi32(q2, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256
_add_epi64(summ, mull); |
| 833 mull = _mm256_mul_epi32(q1, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256
_add_epi64(summ, mull); |
| 834 mull = _mm256_mul_epi32(q0, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256
_add_epi64(summ, mull); |
| 835 summ = _mm256_permutevar8x32_epi
32(_mm256_srl_epi64(summ, cnt), pack); |
| 836 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi2
56_si128(summ))); |
| 837 } |
| 838 } |
| 839 } |
| 840 else { |
| 841 if(order == 10) { |
| 842 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9; |
| 843 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[0 ])); |
| 844 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[1 ])); |
| 845 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[2 ])); |
| 846 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[3 ])); |
| 847 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[4 ])); |
| 848 q5 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[5 ])); |
| 849 q6 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[6 ])); |
| 850 q7 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[7 ])); |
| 851 q8 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[8 ])); |
| 852 q9 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[9 ])); |
| 853 |
| 854 for(i = 0; i < (int)data_len-3; i+=4) { |
| 855 __m256i summ, mull; |
| 856 summ = _mm256_mul_epi32(q9, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); |
| 857 mull = _mm256_mul_epi32(q8, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256
_add_epi64(summ, mull); |
| 858 mull = _mm256_mul_epi32(q7, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256
_add_epi64(summ, mull); |
| 859 mull = _mm256_mul_epi32(q6, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256
_add_epi64(summ, mull); |
| 860 mull = _mm256_mul_epi32(q5, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256
_add_epi64(summ, mull); |
| 861 mull = _mm256_mul_epi32(q4, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256
_add_epi64(summ, mull); |
| 862 mull = _mm256_mul_epi32(q3, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256
_add_epi64(summ, mull); |
| 863 mull = _mm256_mul_epi32(q2, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256
_add_epi64(summ, mull); |
| 864 mull = _mm256_mul_epi32(q1, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256
_add_epi64(summ, mull); |
| 865 mull = _mm256_mul_epi32(q0, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256
_add_epi64(summ, mull); |
| 866 summ = _mm256_permutevar8x32_epi
32(_mm256_srl_epi64(summ, cnt), pack); |
| 867 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi2
56_si128(summ))); |
| 868 } |
| 869 } |
| 870 else { /* order == 9 */ |
| 871 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8; |
| 872 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[0 ])); |
| 873 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[1 ])); |
| 874 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[2 ])); |
| 875 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[3 ])); |
| 876 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[4 ])); |
| 877 q5 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[5 ])); |
| 878 q6 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[6 ])); |
| 879 q7 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[7 ])); |
| 880 q8 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[8 ])); |
| 881 |
| 882 for(i = 0; i < (int)data_len-3; i+=4) { |
| 883 __m256i summ, mull; |
| 884 summ = _mm256_mul_epi32(q8, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); |
| 885 mull = _mm256_mul_epi32(q7, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256
_add_epi64(summ, mull); |
| 886 mull = _mm256_mul_epi32(q6, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256
_add_epi64(summ, mull); |
| 887 mull = _mm256_mul_epi32(q5, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256
_add_epi64(summ, mull); |
| 888 mull = _mm256_mul_epi32(q4, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256
_add_epi64(summ, mull); |
| 889 mull = _mm256_mul_epi32(q3, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256
_add_epi64(summ, mull); |
| 890 mull = _mm256_mul_epi32(q2, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256
_add_epi64(summ, mull); |
| 891 mull = _mm256_mul_epi32(q1, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256
_add_epi64(summ, mull); |
| 892 mull = _mm256_mul_epi32(q0, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256
_add_epi64(summ, mull); |
| 893 summ = _mm256_permutevar8x32_epi
32(_mm256_srl_epi64(summ, cnt), pack); |
| 894 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi2
56_si128(summ))); |
| 895 } |
| 896 } |
| 897 } |
| 898 } |
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
					q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 ))));
						mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 7 */
					__m256i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 ))));
						mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
			else {
				if(order == 6) {
					__m256i q0, q1, q2, q3, q4, q5;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 ))));
						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 5 */
					__m256i q0, q1, q2, q3, q4;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 ))));
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
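		/* Orders 1..4: the shortest predictors, still processed four residuals at a time. */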
		else {
			if(order > 2) {
				if(order == 4) {
					__m256i q0, q1, q2, q3;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 ))));
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 3 */
					__m256i q0, q1, q2;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 ))));
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
			else {
				if(order == 2) {
					__m256i q0, q1;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 ))));
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 1 */
					__m256i q0;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ;
						summ = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 ))));
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
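		/* Scalar mop-up for the trailing samples (at most three) that the 4-wide loops
		 * above could not reach.  The switch intentionally falls through without breaks:
		 * entering at `order`, each case adds one coefficient*history term, so the result
		 * is residual[i] = data[i] - ((sum over j < order of qlp_coeff[j]*data[i-1-j]) >> lp_quantization). */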
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				case 11: sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				case 10: sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				case 9:  sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
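	/* Orders 13..32 skip the SIMD paths and use plain 64-bit scalar arithmetic; the
	 * fall-through switch adds the terms for coefficients 31..12 as applicable, then
	 * the unlabelled statements add coefficients 11..0 for every order in this range. */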
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
	_mm256_zeroupper();
}

#endif /* FLAC__AVX2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */