| 1 /* libFLAC - Free Lossless Audio Codec library |
| 2 * Copyright (C) 2000-2009 Josh Coalson |
| 3 * Copyright (C) 2011-2014 Xiph.Org Foundation |
| 4 * |
| 5 * Redistribution and use in source and binary forms, with or without |
| 6 * modification, are permitted provided that the following conditions |
| 7 * are met: |
| 8 * |
| 9 * - Redistributions of source code must retain the above copyright |
| 10 * notice, this list of conditions and the following disclaimer. |
| 11 * |
| 12 * - Redistributions in binary form must reproduce the above copyright |
| 13 * notice, this list of conditions and the following disclaimer in the |
| 14 * documentation and/or other materials provided with the distribution. |
| 15 * |
| 16 * - Neither the name of the Xiph.org Foundation nor the names of its |
| 17 * contributors may be used to endorse or promote products derived from |
| 18 * this software without specific prior written permission. |
| 19 * |
| 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR |
| 24 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 25 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 26 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 27 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| 28 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| 29 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 31 */ |
| 32 |
| 33 #ifdef HAVE_CONFIG_H |
| 34 # include <config.h> |
| 35 #endif |
| 36 |
| 37 #ifndef FLAC__INTEGER_ONLY_LIBRARY |
| 38 #ifndef FLAC__NO_ASM |
| 39 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN |
| 40 #include "private/lpc.h" |
| 41 #ifdef FLAC__SSE2_SUPPORTED |
| 42 |
| 43 #include "FLAC/assert.h" |
| 44 #include "FLAC/format.h" |
| 45 |
| 46 #include <emmintrin.h> /* SSE2 */ |
| 47 |
| 48 #define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization); |
| 49 #define DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr; |
| 50 |
| 51 #define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization); |
| 52 #define DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); |
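/* Each *_RESULT macro takes the XMM register whose low 32-bit lane holds the
 * prediction sum, extracts it with _mm_cvtsi128_si32, applies the
 * lp_quantization arithmetic shift, and either forms a residual sample
 * (data minus prediction) or restores a data sample (residual plus
 * prediction).  The *16* variants advance the data/residual pointers and keep
 * a copy of the current sample in `curr` so a restore loop can feed it back
 * into its packed history; the *32* variants simply index with `i`. */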
| 53 |
| 54 FLAC__SSE_TARGET("sse2") |
| 55 void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) |
| 56 { |
| 57 int i; |
| 58 FLAC__int32 sum; |
| 59 __m128i cnt = _mm_cvtsi32_si128(lp_quantization); |
| 60 |
| 61 FLAC__ASSERT(order > 0); |
| 62 FLAC__ASSERT(order <= 32); |
| 63 |
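	/* Orders 1..12 get hand-unrolled SSE2 kernels selected by the branch tree
	 * below; longer orders fall through to the scalar switch at the bottom.
	 * Each kernel broadcasts one coefficient per XMM register as a 16-bit
	 * value with a zero high half, so _mm_madd_epi16 against four consecutive
	 * 32-bit samples multiplies each sample's low 16 bits by the coefficient
	 * and adds a zero term - which this _16 variant assumes is lossless, i.e.
	 * samples and coefficients fit in 16 bits.  Each loop iteration thus
	 * produces four values of
	 *   residual[i] = data[i] - ((sum over j of qlp_coeff[j]*data[i-1-j]) >> lp_quantization)
	 * and the scalar loop after the tree handles the remaining samples. */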
| 64 if(order <= 12) { |
| 65 if(order > 8) { |
| 66 if(order > 10) { |
| 67 if(order == 12) { |
| 68 __m128i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9, q10, q11; |
| 69 q0 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
| 70 q1 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
| 71 q2 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
| 72 q3 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); |
| 73 q4 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); |
| 74 q5 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); |
| 75 q6 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); |
| 76 q7 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); |
| 77 q8 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); |
| 78 q9 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0)); |
| 79 q10 = _mm_cvtsi32_si128(0xffff & qlp_coe
ff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0)); |
| 80 q11 = _mm_cvtsi32_si128(0xffff & qlp_coe
ff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0)); |
| 81 |
| 82 for(i = 0; i < (int)data_len-3; i+=4) { |
| 83 __m128i summ, mull; |
| 84 summ = _mm_madd_epi16(q11, _mm_l
oadu_si128((const __m128i*)(data+i-12))); |
| 85 mull = _mm_madd_epi16(q10, _mm_l
oadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull); |
| 86 mull = _mm_madd_epi16(q9, _mm_lo
adu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull); |
| 87 mull = _mm_madd_epi16(q8, _mm_lo
adu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull); |
| 88 mull = _mm_madd_epi16(q7, _mm_lo
adu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); |
| 89 mull = _mm_madd_epi16(q6, _mm_lo
adu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); |
| 90 mull = _mm_madd_epi16(q5, _mm_lo
adu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); |
| 91 mull = _mm_madd_epi16(q4, _mm_lo
adu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); |
| 92 mull = _mm_madd_epi16(q3, _mm_lo
adu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); |
| 93 mull = _mm_madd_epi16(q2, _mm_lo
adu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); |
| 94 mull = _mm_madd_epi16(q1, _mm_lo
adu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); |
| 95 mull = _mm_madd_epi16(q0, _mm_lo
adu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
| 96 summ = _mm_sra_epi32(summ, cnt); |
| 97 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
| 98 } |
| 99 } |
| 100 else { /* order == 11 */ |
| 101 __m128i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9, q10; |
| 102 q0 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
| 103 q1 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
| 104 q2 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
| 105 q3 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); |
| 106 q4 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); |
| 107 q5 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); |
| 108 q6 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); |
| 109 q7 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); |
| 110 q8 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); |
| 111 q9 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0)); |
| 112 q10 = _mm_cvtsi32_si128(0xffff & qlp_coe
ff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0)); |
| 113 |
| 114 for(i = 0; i < (int)data_len-3; i+=4) { |
| 115 __m128i summ, mull; |
| 116 summ = _mm_madd_epi16(q10, _mm_l
oadu_si128((const __m128i*)(data+i-11))); |
| 117 mull = _mm_madd_epi16(q9, _mm_lo
adu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull); |
| 118 mull = _mm_madd_epi16(q8, _mm_lo
adu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull); |
| 119 mull = _mm_madd_epi16(q7, _mm_lo
adu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); |
| 120 mull = _mm_madd_epi16(q6, _mm_lo
adu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); |
| 121 mull = _mm_madd_epi16(q5, _mm_lo
adu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); |
| 122 mull = _mm_madd_epi16(q4, _mm_lo
adu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); |
| 123 mull = _mm_madd_epi16(q3, _mm_lo
adu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); |
| 124 mull = _mm_madd_epi16(q2, _mm_lo
adu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); |
| 125 mull = _mm_madd_epi16(q1, _mm_lo
adu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); |
| 126 mull = _mm_madd_epi16(q0, _mm_lo
adu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
| 127 summ = _mm_sra_epi32(summ, cnt); |
| 128 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
| 129 } |
| 130 } |
| 131 } |
| 132 else { |
| 133 if(order == 10) { |
| 134 __m128i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9; |
| 135 q0 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
| 136 q1 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
| 137 q2 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
| 138 q3 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); |
| 139 q4 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); |
| 140 q5 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); |
| 141 q6 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); |
| 142 q7 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); |
| 143 q8 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); |
| 144 q9 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0)); |
| 145 |
| 146 for(i = 0; i < (int)data_len-3; i+=4) { |
| 147 __m128i summ, mull; |
| 148 summ = _mm_madd_epi16(q9, _mm_lo
adu_si128((const __m128i*)(data+i-10))); |
| 149 mull = _mm_madd_epi16(q8, _mm_lo
adu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull); |
| 150 mull = _mm_madd_epi16(q7, _mm_lo
adu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); |
| 151 mull = _mm_madd_epi16(q6, _mm_lo
adu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); |
| 152 mull = _mm_madd_epi16(q5, _mm_lo
adu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); |
| 153 mull = _mm_madd_epi16(q4, _mm_lo
adu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); |
| 154 mull = _mm_madd_epi16(q3, _mm_lo
adu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); |
| 155 mull = _mm_madd_epi16(q2, _mm_lo
adu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); |
| 156 mull = _mm_madd_epi16(q1, _mm_lo
adu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); |
| 157 mull = _mm_madd_epi16(q0, _mm_lo
adu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
| 158 summ = _mm_sra_epi32(summ, cnt); |
| 159 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
| 160 } |
| 161 } |
| 162 else { /* order == 9 */ |
| 163 __m128i q0, q1, q2, q3, q4, q5, q6, q7,
q8; |
| 164 q0 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
| 165 q1 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
| 166 q2 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
| 167 q3 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); |
| 168 q4 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); |
| 169 q5 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); |
| 170 q6 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); |
| 171 q7 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); |
| 172 q8 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); |
| 173 |
| 174 for(i = 0; i < (int)data_len-3; i+=4) { |
| 175 __m128i summ, mull; |
| 176 summ = _mm_madd_epi16(q8, _mm_lo
adu_si128((const __m128i*)(data+i-9))); |
| 177 mull = _mm_madd_epi16(q7, _mm_lo
adu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); |
| 178 mull = _mm_madd_epi16(q6, _mm_lo
adu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); |
| 179 mull = _mm_madd_epi16(q5, _mm_lo
adu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); |
| 180 mull = _mm_madd_epi16(q4, _mm_lo
adu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); |
| 181 mull = _mm_madd_epi16(q3, _mm_lo
adu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); |
| 182 mull = _mm_madd_epi16(q2, _mm_lo
adu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); |
| 183 mull = _mm_madd_epi16(q1, _mm_lo
adu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); |
| 184 mull = _mm_madd_epi16(q0, _mm_lo
adu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
| 185 summ = _mm_sra_epi32(summ, cnt); |
| 186 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
| 187 } |
| 188 } |
| 189 } |
| 190 } |
| 191 else if(order > 4) { |
| 192 if(order > 6) { |
| 193 if(order == 8) { |
| 194 __m128i q0, q1, q2, q3, q4, q5, q6, q7; |
| 195 q0 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
| 196 q1 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
| 197 q2 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
| 198 q3 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); |
| 199 q4 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); |
| 200 q5 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); |
| 201 q6 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); |
| 202 q7 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); |
| 203 |
| 204 for(i = 0; i < (int)data_len-3; i+=4) { |
| 205 __m128i summ, mull; |
| 206 summ = _mm_madd_epi16(q7, _mm_lo
adu_si128((const __m128i*)(data+i-8))); |
| 207 mull = _mm_madd_epi16(q6, _mm_lo
adu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); |
| 208 mull = _mm_madd_epi16(q5, _mm_lo
adu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); |
| 209 mull = _mm_madd_epi16(q4, _mm_lo
adu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); |
| 210 mull = _mm_madd_epi16(q3, _mm_lo
adu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); |
| 211 mull = _mm_madd_epi16(q2, _mm_lo
adu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); |
| 212 mull = _mm_madd_epi16(q1, _mm_lo
adu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); |
| 213 mull = _mm_madd_epi16(q0, _mm_lo
adu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
| 214 summ = _mm_sra_epi32(summ, cnt); |
| 215 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
| 216 } |
| 217 } |
| 218 else { /* order == 7 */ |
| 219 __m128i q0, q1, q2, q3, q4, q5, q6; |
| 220 q0 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
| 221 q1 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
| 222 q2 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
| 223 q3 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); |
| 224 q4 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); |
| 225 q5 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); |
| 226 q6 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); |
| 227 |
| 228 for(i = 0; i < (int)data_len-3; i+=4) { |
| 229 __m128i summ, mull; |
| 230 summ = _mm_madd_epi16(q6, _mm_lo
adu_si128((const __m128i*)(data+i-7))); |
| 231 mull = _mm_madd_epi16(q5, _mm_lo
adu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); |
| 232 mull = _mm_madd_epi16(q4, _mm_lo
adu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); |
| 233 mull = _mm_madd_epi16(q3, _mm_lo
adu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); |
| 234 mull = _mm_madd_epi16(q2, _mm_lo
adu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); |
| 235 mull = _mm_madd_epi16(q1, _mm_lo
adu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); |
| 236 mull = _mm_madd_epi16(q0, _mm_lo
adu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
| 237 summ = _mm_sra_epi32(summ, cnt); |
| 238 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
| 239 } |
| 240 } |
| 241 } |
| 242 else { |
| 243 if(order == 6) { |
| 244 __m128i q0, q1, q2, q3, q4, q5; |
| 245 q0 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
| 246 q1 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
| 247 q2 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
| 248 q3 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); |
| 249 q4 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); |
| 250 q5 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); |
| 251 |
| 252 for(i = 0; i < (int)data_len-3; i+=4) { |
| 253 __m128i summ, mull; |
| 254 summ = _mm_madd_epi16(q5, _mm_lo
adu_si128((const __m128i*)(data+i-6))); |
| 255 mull = _mm_madd_epi16(q4, _mm_lo
adu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); |
| 256 mull = _mm_madd_epi16(q3, _mm_lo
adu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); |
| 257 mull = _mm_madd_epi16(q2, _mm_lo
adu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); |
| 258 mull = _mm_madd_epi16(q1, _mm_lo
adu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); |
| 259 mull = _mm_madd_epi16(q0, _mm_lo
adu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
| 260 summ = _mm_sra_epi32(summ, cnt); |
| 261 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
| 262 } |
| 263 } |
| 264 else { /* order == 5 */ |
| 265 __m128i q0, q1, q2, q3, q4; |
| 266 q0 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
| 267 q1 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
| 268 q2 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
| 269 q3 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); |
| 270 q4 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); |
| 271 |
| 272 for(i = 0; i < (int)data_len-3; i+=4) { |
| 273 __m128i summ, mull; |
| 274 summ = _mm_madd_epi16(q4, _mm_lo
adu_si128((const __m128i*)(data+i-5))); |
| 275 mull = _mm_madd_epi16(q3, _mm_lo
adu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); |
| 276 mull = _mm_madd_epi16(q2, _mm_lo
adu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); |
| 277 mull = _mm_madd_epi16(q1, _mm_lo
adu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); |
| 278 mull = _mm_madd_epi16(q0, _mm_lo
adu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
| 279 summ = _mm_sra_epi32(summ, cnt); |
| 280 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
| 281 } |
| 282 } |
| 283 } |
| 284 } |
| 285 else { |
| 286 if(order > 2) { |
| 287 if(order == 4) { |
| 288 __m128i q0, q1, q2, q3; |
| 289 q0 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
| 290 q1 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
| 291 q2 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
| 292 q3 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); |
| 293 |
| 294 for(i = 0; i < (int)data_len-3; i+=4) { |
| 295 __m128i summ, mull; |
| 296 summ = _mm_madd_epi16(q3, _mm_lo
adu_si128((const __m128i*)(data+i-4))); |
| 297 mull = _mm_madd_epi16(q2, _mm_lo
adu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); |
| 298 mull = _mm_madd_epi16(q1, _mm_lo
adu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); |
| 299 mull = _mm_madd_epi16(q0, _mm_lo
adu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
| 300 summ = _mm_sra_epi32(summ, cnt); |
| 301 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
| 302 } |
| 303 } |
| 304 else { /* order == 3 */ |
| 305 __m128i q0, q1, q2; |
| 306 q0 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
| 307 q1 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
| 308 q2 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); |
| 309 |
| 310 for(i = 0; i < (int)data_len-3; i+=4) { |
| 311 __m128i summ, mull; |
| 312 summ = _mm_madd_epi16(q2, _mm_lo
adu_si128((const __m128i*)(data+i-3))); |
| 313 mull = _mm_madd_epi16(q1, _mm_lo
adu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); |
| 314 mull = _mm_madd_epi16(q0, _mm_lo
adu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
| 315 summ = _mm_sra_epi32(summ, cnt); |
| 316 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
| 317 } |
| 318 } |
| 319 } |
| 320 else { |
| 321 if(order == 2) { |
| 322 __m128i q0, q1; |
| 323 q0 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
| 324 q1 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); |
| 325 |
| 326 for(i = 0; i < (int)data_len-3; i+=4) { |
| 327 __m128i summ, mull; |
| 328 summ = _mm_madd_epi16(q1, _mm_lo
adu_si128((const __m128i*)(data+i-2))); |
| 329 mull = _mm_madd_epi16(q0, _mm_lo
adu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); |
| 330 summ = _mm_sra_epi32(summ, cnt); |
| 331 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
| 332 } |
| 333 } |
| 334 else { /* order == 1 */ |
| 335 __m128i q0; |
| 336 q0 = _mm_cvtsi32_si128(0xffff & qlp_coef
f[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); |
| 337 |
| 338 for(i = 0; i < (int)data_len-3; i+=4) { |
| 339 __m128i summ; |
| 340 summ = _mm_madd_epi16(q0, _mm_lo
adu_si128((const __m128i*)(data+i-1))); |
| 341 summ = _mm_sra_epi32(summ, cnt); |
| 342 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); |
| 343 } |
| 344 } |
| 345 } |
| 346 } |
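	/* handle the 0..3 trailing samples the 4-wide loops above could not cover */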
| 347 for(; i < (int)data_len; i++) { |
| 348 sum = 0; |
| 349 switch(order) { |
| 350 case 12: sum += qlp_coeff[11] * data[i-12]; |
| 351 case 11: sum += qlp_coeff[10] * data[i-11]; |
| 352 case 10: sum += qlp_coeff[ 9] * data[i-10]; |
| 353 case 9: sum += qlp_coeff[ 8] * data[i- 9]; |
| 354 case 8: sum += qlp_coeff[ 7] * data[i- 8]; |
| 355 case 7: sum += qlp_coeff[ 6] * data[i- 7]; |
| 356 case 6: sum += qlp_coeff[ 5] * data[i- 6]; |
| 357 case 5: sum += qlp_coeff[ 4] * data[i- 5]; |
| 358 case 4: sum += qlp_coeff[ 3] * data[i- 4]; |
| 359 case 3: sum += qlp_coeff[ 2] * data[i- 3]; |
| 360 case 2: sum += qlp_coeff[ 1] * data[i- 2]; |
| 361 case 1: sum += qlp_coeff[ 0] * data[i- 1]; |
| 362 } |
| 363 residual[i] = data[i] - (sum >> lp_quantization); |
| 364 } |
| 365 } |
| 366 else { /* order > 12 */ |
| 367 for(i = 0; i < (int)data_len; i++) { |
| 368 sum = 0; |
| 369 switch(order) { |
| 370 case 32: sum += qlp_coeff[31] * data[i-32]; |
| 371 case 31: sum += qlp_coeff[30] * data[i-31]; |
| 372 case 30: sum += qlp_coeff[29] * data[i-30]; |
| 373 case 29: sum += qlp_coeff[28] * data[i-29]; |
| 374 case 28: sum += qlp_coeff[27] * data[i-28]; |
| 375 case 27: sum += qlp_coeff[26] * data[i-27]; |
| 376 case 26: sum += qlp_coeff[25] * data[i-26]; |
| 377 case 25: sum += qlp_coeff[24] * data[i-25]; |
| 378 case 24: sum += qlp_coeff[23] * data[i-24]; |
| 379 case 23: sum += qlp_coeff[22] * data[i-23]; |
| 380 case 22: sum += qlp_coeff[21] * data[i-22]; |
| 381 case 21: sum += qlp_coeff[20] * data[i-21]; |
| 382 case 20: sum += qlp_coeff[19] * data[i-20]; |
| 383 case 19: sum += qlp_coeff[18] * data[i-19]; |
| 384 case 18: sum += qlp_coeff[17] * data[i-18]; |
| 385 case 17: sum += qlp_coeff[16] * data[i-17]; |
| 386 case 16: sum += qlp_coeff[15] * data[i-16]; |
| 387 case 15: sum += qlp_coeff[14] * data[i-15]; |
| 388 case 14: sum += qlp_coeff[13] * data[i-14]; |
| 389 case 13: sum += qlp_coeff[12] * data[i-13]; |
| 390 sum += qlp_coeff[11] * data[i-12]; |
| 391 sum += qlp_coeff[10] * data[i-11]; |
| 392 sum += qlp_coeff[ 9] * data[i-10]; |
| 393 sum += qlp_coeff[ 8] * data[i- 9]; |
| 394 sum += qlp_coeff[ 7] * data[i- 8]; |
| 395 sum += qlp_coeff[ 6] * data[i- 7]; |
| 396 sum += qlp_coeff[ 5] * data[i- 6]; |
| 397 sum += qlp_coeff[ 4] * data[i- 5]; |
| 398 sum += qlp_coeff[ 3] * data[i- 4]; |
| 399 sum += qlp_coeff[ 2] * data[i- 3]; |
| 400 sum += qlp_coeff[ 1] * data[i- 2]; |
| 401 sum += qlp_coeff[ 0] * data[i- 1]; |
| 402 } |
| 403 residual[i] = data[i] - (sum >> lp_quantization); |
| 404 } |
| 405 } |
| 406 } |
| 407 |
| 408 FLAC__SSE_TARGET("sse2") |
| 409 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) |
| 410 { |
| 411 int i; |
| 412 |
| 413 FLAC__ASSERT(order > 0); |
| 414 FLAC__ASSERT(order <= 32); |
| 415 |
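	/* This wide version keeps the full 32-bit coefficients.  SSE2 has no
	 * packed 32x32->32 multiply, so pairs of coefficients and samples are
	 * spread into the even dword lanes and multiplied with _mm_mul_epu32,
	 * giving two 64-bit products per instruction.  Only the low 32 bits of
	 * each product are kept (signed and unsigned multiplication agree there),
	 * the two partial sums are folded together with _mm_srli_si128 +
	 * _mm_add_epi32, and one residual is produced per loop iteration. */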
| 416 if(order <= 12) { |
| 417 if(order > 8) { /* order == 9, 10, 11, 12 */ |
| 418 if(order > 10) { /* order == 11, 12 */ |
| 419 if(order == 12) { |
| 420 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xm
m5, xmm6, xmm7; |
| 421 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); // 0 0 q[1] q[0] |
| 422 xmm1 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+2)); // 0 0 q[3] q[2] |
| 423 xmm2 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+4)); // 0 0 q[5] q[4] |
| 424 xmm3 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+6)); // 0 0 q[7] q[6] |
| 425 xmm4 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+8)); // 0 0 q[9] q[8] |
| 426 xmm5 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+10)); // 0 0 q[11] q[10] |
| 427 |
| 428 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); // 0 q[1] 0 q[0] |
| 429 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF
LE(3,1,2,0)); // 0 q[3] 0 q[2] |
| 430 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF
LE(3,1,2,0)); // 0 q[5] 0 q[4] |
| 431 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFF
LE(3,1,2,0)); // 0 q[7] 0 q[6] |
| 432 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFF
LE(3,1,2,0)); // 0 q[9] 0 q[8] |
| 433 xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFF
LE(3,1,2,0)); // 0 q[11] 0 q[10] |
| 434 |
| 435 for(i = 0; i < (int)data_len; i++) { |
| 436 //sum = 0; |
| 437 //sum += qlp_coeff[11] * data[i-
12]; |
| 438 //sum += qlp_coeff[10] * data[i-
11]; |
| 439 xmm7 = _mm_loadl_epi64((const __
m128i*)(data+i-12)); // 0 0 d[i-11] d[i-12] |
| 440 xmm7 = _mm_shuffle_epi32(xmm7, _
MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11] |
| 441 xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */ |
| 442 |
| 443 //sum += qlp_coeff[9] * data[i-1
0]; |
| 444 //sum += qlp_coeff[8] * data[i-9
]; |
| 445 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-10)); |
| 446 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 447 xmm6 = _mm_mul_epu32(xmm6, xmm4)
; |
| 448 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 449 |
| 450 //sum += qlp_coeff[7] * data[i-8
]; |
| 451 //sum += qlp_coeff[6] * data[i-7
]; |
| 452 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-8)); |
| 453 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 454 xmm6 = _mm_mul_epu32(xmm6, xmm3)
; |
| 455 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 456 |
| 457 //sum += qlp_coeff[5] * data[i-6
]; |
| 458 //sum += qlp_coeff[4] * data[i-5
]; |
| 459 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-6)); |
| 460 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 461 xmm6 = _mm_mul_epu32(xmm6, xmm2)
; |
| 462 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 463 |
| 464 //sum += qlp_coeff[3] * data[i-4
]; |
| 465 //sum += qlp_coeff[2] * data[i-3
]; |
| 466 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-4)); |
| 467 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 468 xmm6 = _mm_mul_epu32(xmm6, xmm1)
; |
| 469 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 470 |
| 471 //sum += qlp_coeff[1] * data[i-2
]; |
| 472 //sum += qlp_coeff[0] * data[i-1
]; |
| 473 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 474 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 475 xmm6 = _mm_mul_epu32(xmm6, xmm0)
; |
| 476 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 477 |
| 478 xmm7 = _mm_add_epi32(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 479 RESIDUAL32_RESULT(xmm7); |
| 480 } |
| 481 } |
| 482 else { /* order == 11 */ |
| 483 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xm
m5, xmm6, xmm7; |
| 484 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 485 xmm1 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+2)); |
| 486 xmm2 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+4)); |
| 487 xmm3 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+6)); |
| 488 xmm4 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+8)); |
| 489 xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]); |
| 490 |
| 491 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 492 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF
LE(3,1,2,0)); |
| 493 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF
LE(3,1,2,0)); |
| 494 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFF
LE(3,1,2,0)); |
| 495 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFF
LE(3,1,2,0)); |
| 496 |
| 497 for(i = 0; i < (int)data_len; i++) { |
| 498 //sum = 0; |
| 499 //sum = qlp_coeff[10] * data[i-
11]; |
| 500 xmm7 = _mm_cvtsi32_si128(data[i-
11]); |
| 501 xmm7 = _mm_mul_epu32(xmm7, xmm5)
; |
| 502 |
| 503 //sum += qlp_coeff[9] * data[i-1
0]; |
| 504 //sum += qlp_coeff[8] * data[i-9
]; |
| 505 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-10)); |
| 506 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 507 xmm6 = _mm_mul_epu32(xmm6, xmm4)
; |
| 508 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 509 |
| 510 //sum += qlp_coeff[7] * data[i-8
]; |
| 511 //sum += qlp_coeff[6] * data[i-7
]; |
| 512 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-8)); |
| 513 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 514 xmm6 = _mm_mul_epu32(xmm6, xmm3)
; |
| 515 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 516 |
| 517 //sum += qlp_coeff[5] * data[i-6
]; |
| 518 //sum += qlp_coeff[4] * data[i-5
]; |
| 519 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-6)); |
| 520 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 521 xmm6 = _mm_mul_epu32(xmm6, xmm2)
; |
| 522 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 523 |
| 524 //sum += qlp_coeff[3] * data[i-4
]; |
| 525 //sum += qlp_coeff[2] * data[i-3
]; |
| 526 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-4)); |
| 527 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 528 xmm6 = _mm_mul_epu32(xmm6, xmm1)
; |
| 529 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 530 |
| 531 //sum += qlp_coeff[1] * data[i-2
]; |
| 532 //sum += qlp_coeff[0] * data[i-1
]; |
| 533 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 534 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 535 xmm6 = _mm_mul_epu32(xmm6, xmm0)
; |
| 536 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 537 |
| 538 xmm7 = _mm_add_epi32(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 539 RESIDUAL32_RESULT(xmm7); |
| 540 } |
| 541 } |
| 542 } |
| 543 else { /* order == 9, 10 */ |
| 544 if(order == 10) { |
| 545 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xm
m6, xmm7; |
| 546 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 547 xmm1 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+2)); |
| 548 xmm2 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+4)); |
| 549 xmm3 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+6)); |
| 550 xmm4 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+8)); |
| 551 |
| 552 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 553 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF
LE(3,1,2,0)); |
| 554 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF
LE(3,1,2,0)); |
| 555 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFF
LE(3,1,2,0)); |
| 556 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFF
LE(3,1,2,0)); |
| 557 |
| 558 for(i = 0; i < (int)data_len; i++) { |
| 559 //sum = 0; |
| 560 //sum += qlp_coeff[9] * data[i-1
0]; |
| 561 //sum += qlp_coeff[8] * data[i-9
]; |
| 562 xmm7 = _mm_loadl_epi64((const __
m128i*)(data+i-10)); |
| 563 xmm7 = _mm_shuffle_epi32(xmm7, _
MM_SHUFFLE(2,0,3,1)); |
| 564 xmm7 = _mm_mul_epu32(xmm7, xmm4)
; |
| 565 |
| 566 //sum += qlp_coeff[7] * data[i-8
]; |
| 567 //sum += qlp_coeff[6] * data[i-7
]; |
| 568 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-8)); |
| 569 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 570 xmm6 = _mm_mul_epu32(xmm6, xmm3)
; |
| 571 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 572 |
| 573 //sum += qlp_coeff[5] * data[i-6
]; |
| 574 //sum += qlp_coeff[4] * data[i-5
]; |
| 575 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-6)); |
| 576 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 577 xmm6 = _mm_mul_epu32(xmm6, xmm2)
; |
| 578 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 579 |
| 580 //sum += qlp_coeff[3] * data[i-4
]; |
| 581 //sum += qlp_coeff[2] * data[i-3
]; |
| 582 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-4)); |
| 583 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 584 xmm6 = _mm_mul_epu32(xmm6, xmm1)
; |
| 585 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 586 |
| 587 //sum += qlp_coeff[1] * data[i-2
]; |
| 588 //sum += qlp_coeff[0] * data[i-1
]; |
| 589 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 590 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 591 xmm6 = _mm_mul_epu32(xmm6, xmm0)
; |
| 592 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 593 |
| 594 xmm7 = _mm_add_epi32(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 595 RESIDUAL32_RESULT(xmm7); |
| 596 } |
| 597 } |
| 598 else { /* order == 9 */ |
| 599 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xm
m6, xmm7; |
| 600 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 601 xmm1 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+2)); |
| 602 xmm2 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+4)); |
| 603 xmm3 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+6)); |
| 604 xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]); |
| 605 |
| 606 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 607 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF
LE(3,1,2,0)); |
| 608 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF
LE(3,1,2,0)); |
| 609 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFF
LE(3,1,2,0)); |
| 610 |
| 611 for(i = 0; i < (int)data_len; i++) { |
| 612 //sum = 0; |
| 613 //sum = qlp_coeff[8] * data[i-9
]; |
| 614 xmm7 = _mm_cvtsi32_si128(data[i-
9]); |
| 615 xmm7 = _mm_mul_epu32(xmm7, xmm4)
; |
| 616 |
| 617 //sum += qlp_coeff[7] * data[i-8
]; |
| 618 //sum += qlp_coeff[6] * data[i-7
]; |
| 619 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-8)); |
| 620 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 621 xmm6 = _mm_mul_epu32(xmm6, xmm3)
; |
| 622 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 623 |
| 624 //sum += qlp_coeff[5] * data[i-6
]; |
| 625 //sum += qlp_coeff[4] * data[i-5
]; |
| 626 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-6)); |
| 627 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 628 xmm6 = _mm_mul_epu32(xmm6, xmm2)
; |
| 629 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 630 |
| 631 //sum += qlp_coeff[3] * data[i-4
]; |
| 632 //sum += qlp_coeff[2] * data[i-3
]; |
| 633 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-4)); |
| 634 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 635 xmm6 = _mm_mul_epu32(xmm6, xmm1)
; |
| 636 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 637 |
| 638 //sum += qlp_coeff[1] * data[i-2
]; |
| 639 //sum += qlp_coeff[0] * data[i-1
]; |
| 640 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 641 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 642 xmm6 = _mm_mul_epu32(xmm6, xmm0)
; |
| 643 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 644 |
| 645 xmm7 = _mm_add_epi32(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 646 RESIDUAL32_RESULT(xmm7); |
| 647 } |
| 648 } |
| 649 } |
| 650 } |
| 651 else if(order > 4) { /* order == 5, 6, 7, 8 */ |
| 652 if(order > 6) { /* order == 7, 8 */ |
| 653 if(order == 8) { |
| 654 __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xm
m7; |
| 655 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 656 xmm1 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+2)); |
| 657 xmm2 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+4)); |
| 658 xmm3 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+6)); |
| 659 |
| 660 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 661 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF
LE(3,1,2,0)); |
| 662 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF
LE(3,1,2,0)); |
| 663 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFF
LE(3,1,2,0)); |
| 664 |
| 665 for(i = 0; i < (int)data_len; i++) { |
| 666 //sum = 0; |
| 667 //sum += qlp_coeff[7] * data[i-8
]; |
| 668 //sum += qlp_coeff[6] * data[i-7
]; |
| 669 xmm7 = _mm_loadl_epi64((const __
m128i*)(data+i-8)); |
| 670 xmm7 = _mm_shuffle_epi32(xmm7, _
MM_SHUFFLE(2,0,3,1)); |
| 671 xmm7 = _mm_mul_epu32(xmm7, xmm3)
; |
| 672 |
| 673 //sum += qlp_coeff[5] * data[i-6
]; |
| 674 //sum += qlp_coeff[4] * data[i-5
]; |
| 675 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-6)); |
| 676 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 677 xmm6 = _mm_mul_epu32(xmm6, xmm2)
; |
| 678 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 679 |
| 680 //sum += qlp_coeff[3] * data[i-4
]; |
| 681 //sum += qlp_coeff[2] * data[i-3
]; |
| 682 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-4)); |
| 683 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 684 xmm6 = _mm_mul_epu32(xmm6, xmm1)
; |
| 685 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 686 |
| 687 //sum += qlp_coeff[1] * data[i-2
]; |
| 688 //sum += qlp_coeff[0] * data[i-1
]; |
| 689 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 690 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 691 xmm6 = _mm_mul_epu32(xmm6, xmm0)
; |
| 692 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 693 |
| 694 xmm7 = _mm_add_epi32(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 695 RESIDUAL32_RESULT(xmm7); |
| 696 } |
| 697 } |
| 698 else { /* order == 7 */ |
| 699 __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xm
m7; |
| 700 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 701 xmm1 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+2)); |
| 702 xmm2 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+4)); |
| 703 xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]); |
| 704 |
| 705 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 706 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF
LE(3,1,2,0)); |
| 707 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF
LE(3,1,2,0)); |
| 708 |
| 709 for(i = 0; i < (int)data_len; i++) { |
| 710 //sum = 0; |
| 711 //sum = qlp_coeff[6] * data[i-7
]; |
| 712 xmm7 = _mm_cvtsi32_si128(data[i-
7]); |
| 713 xmm7 = _mm_mul_epu32(xmm7, xmm3)
; |
| 714 |
| 715 //sum += qlp_coeff[5] * data[i-6
]; |
| 716 //sum += qlp_coeff[4] * data[i-5
]; |
| 717 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-6)); |
| 718 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 719 xmm6 = _mm_mul_epu32(xmm6, xmm2)
; |
| 720 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 721 |
| 722 //sum += qlp_coeff[3] * data[i-4
]; |
| 723 //sum += qlp_coeff[2] * data[i-3
]; |
| 724 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-4)); |
| 725 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 726 xmm6 = _mm_mul_epu32(xmm6, xmm1)
; |
| 727 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 728 |
| 729 //sum += qlp_coeff[1] * data[i-2
]; |
| 730 //sum += qlp_coeff[0] * data[i-1
]; |
| 731 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 732 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 733 xmm6 = _mm_mul_epu32(xmm6, xmm0)
; |
| 734 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 735 |
| 736 xmm7 = _mm_add_epi32(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 737 RESIDUAL32_RESULT(xmm7); |
| 738 } |
| 739 } |
| 740 } |
| 741 else { /* order == 5, 6 */ |
| 742 if(order == 6) { |
| 743 __m128i xmm0, xmm1, xmm2, xmm6, xmm7; |
| 744 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 745 xmm1 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+2)); |
| 746 xmm2 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+4)); |
| 747 |
| 748 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 749 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF
LE(3,1,2,0)); |
| 750 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF
LE(3,1,2,0)); |
| 751 |
| 752 for(i = 0; i < (int)data_len; i++) { |
| 753 //sum = 0; |
| 754 //sum += qlp_coeff[5] * data[i-6
]; |
| 755 //sum += qlp_coeff[4] * data[i-5
]; |
| 756 xmm7 = _mm_loadl_epi64((const __
m128i*)(data+i-6)); |
| 757 xmm7 = _mm_shuffle_epi32(xmm7, _
MM_SHUFFLE(2,0,3,1)); |
| 758 xmm7 = _mm_mul_epu32(xmm7, xmm2)
; |
| 759 |
| 760 //sum += qlp_coeff[3] * data[i-4
]; |
| 761 //sum += qlp_coeff[2] * data[i-3
]; |
| 762 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-4)); |
| 763 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 764 xmm6 = _mm_mul_epu32(xmm6, xmm1)
; |
| 765 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 766 |
| 767 //sum += qlp_coeff[1] * data[i-2
]; |
| 768 //sum += qlp_coeff[0] * data[i-1
]; |
| 769 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 770 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 771 xmm6 = _mm_mul_epu32(xmm6, xmm0)
; |
| 772 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 773 |
| 774 xmm7 = _mm_add_epi32(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 775 RESIDUAL32_RESULT(xmm7); |
| 776 } |
| 777 } |
| 778 else { /* order == 5 */ |
| 779 __m128i xmm0, xmm1, xmm2, xmm6, xmm7; |
| 780 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 781 xmm1 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+2)); |
| 782 xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]); |
| 783 |
| 784 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 785 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF
LE(3,1,2,0)); |
| 786 |
| 787 for(i = 0; i < (int)data_len; i++) { |
| 788 //sum = 0; |
| 789 //sum = qlp_coeff[4] * data[i-5
]; |
| 790 xmm7 = _mm_cvtsi32_si128(data[i-
5]); |
| 791 xmm7 = _mm_mul_epu32(xmm7, xmm2)
; |
| 792 |
| 793 //sum += qlp_coeff[3] * data[i-4
]; |
| 794 //sum += qlp_coeff[2] * data[i-3
]; |
| 795 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-4)); |
| 796 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 797 xmm6 = _mm_mul_epu32(xmm6, xmm1)
; |
| 798 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 799 |
| 800 //sum += qlp_coeff[1] * data[i-2
]; |
| 801 //sum += qlp_coeff[0] * data[i-1
]; |
| 802 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 803 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 804 xmm6 = _mm_mul_epu32(xmm6, xmm0)
; |
| 805 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 806 |
| 807 xmm7 = _mm_add_epi32(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 808 RESIDUAL32_RESULT(xmm7); |
| 809 } |
| 810 } |
| 811 } |
| 812 } |
| 813 else { /* order == 1, 2, 3, 4 */ |
| 814 if(order > 2) { /* order == 3, 4 */ |
| 815 if(order == 4) { |
| 816 __m128i xmm0, xmm1, xmm6, xmm7; |
| 817 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 818 xmm1 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+2)); |
| 819 |
| 820 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 821 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF
LE(3,1,2,0)); |
| 822 |
| 823 for(i = 0; i < (int)data_len; i++) { |
| 824 //sum = 0; |
| 825 //sum += qlp_coeff[3] * data[i-4
]; |
| 826 //sum += qlp_coeff[2] * data[i-3
]; |
| 827 xmm7 = _mm_loadl_epi64((const __
m128i*)(data+i-4)); |
| 828 xmm7 = _mm_shuffle_epi32(xmm7, _
MM_SHUFFLE(2,0,3,1)); |
| 829 xmm7 = _mm_mul_epu32(xmm7, xmm1)
; |
| 830 |
| 831 //sum += qlp_coeff[1] * data[i-2
]; |
| 832 //sum += qlp_coeff[0] * data[i-1
]; |
| 833 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 834 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 835 xmm6 = _mm_mul_epu32(xmm6, xmm0)
; |
| 836 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 837 |
| 838 xmm7 = _mm_add_epi32(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 839 RESIDUAL32_RESULT(xmm7); |
| 840 } |
| 841 } |
| 842 else { /* order == 3 */ |
| 843 __m128i xmm0, xmm1, xmm6, xmm7; |
| 844 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 845 xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]); |
| 846 |
| 847 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 848 |
| 849 for(i = 0; i < (int)data_len; i++) { |
| 850 //sum = 0; |
| 851 //sum = qlp_coeff[2] * data[i-3
]; |
| 852 xmm7 = _mm_cvtsi32_si128(data[i-
3]); |
| 853 xmm7 = _mm_mul_epu32(xmm7, xmm1)
; |
| 854 |
| 855 //sum += qlp_coeff[1] * data[i-2
]; |
| 856 //sum += qlp_coeff[0] * data[i-1
]; |
| 857 xmm6 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 858 xmm6 = _mm_shuffle_epi32(xmm6, _
MM_SHUFFLE(2,0,3,1)); |
| 859 xmm6 = _mm_mul_epu32(xmm6, xmm0)
; |
| 860 xmm7 = _mm_add_epi32(xmm7, xmm6)
; |
| 861 |
| 862 xmm7 = _mm_add_epi32(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 863 RESIDUAL32_RESULT(xmm7); |
| 864 } |
| 865 } |
| 866 } |
| 867 else { /* order == 1, 2 */ |
| 868 if(order == 2) { |
| 869 __m128i xmm0, xmm7; |
| 870 xmm0 = _mm_loadl_epi64((const __m128i*)(
qlp_coeff+0)); |
| 871 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF
LE(3,1,2,0)); |
| 872 |
| 873 for(i = 0; i < (int)data_len; i++) { |
| 874 //sum = 0; |
| 875 //sum += qlp_coeff[1] * data[i-2
]; |
| 876 //sum += qlp_coeff[0] * data[i-1
]; |
| 877 xmm7 = _mm_loadl_epi64((const __
m128i*)(data+i-2)); |
| 878 xmm7 = _mm_shuffle_epi32(xmm7, _
MM_SHUFFLE(2,0,3,1)); |
| 879 xmm7 = _mm_mul_epu32(xmm7, xmm0)
; |
| 880 |
| 881 xmm7 = _mm_add_epi32(xmm7, _mm_s
rli_si128(xmm7, 8)); |
| 882 RESIDUAL32_RESULT(xmm7); |
| 883 } |
| 884 } |
| 885 else { /* order == 1 */ |
| 886 for(i = 0; i < (int)data_len; i++) |
| 887 residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization); |
| 888 } |
| 889 } |
| 890 } |
| 891 } |
| 892 else { /* order > 12 */ |
| 893 FLAC__int32 sum; |
| 894 for(i = 0; i < (int)data_len; i++) { |
| 895 sum = 0; |
| 896 switch(order) { |
| 897 case 32: sum += qlp_coeff[31] * data[i-32]; |
| 898 case 31: sum += qlp_coeff[30] * data[i-31]; |
| 899 case 30: sum += qlp_coeff[29] * data[i-30]; |
| 900 case 29: sum += qlp_coeff[28] * data[i-29]; |
| 901 case 28: sum += qlp_coeff[27] * data[i-28]; |
| 902 case 27: sum += qlp_coeff[26] * data[i-27]; |
| 903 case 26: sum += qlp_coeff[25] * data[i-26]; |
| 904 case 25: sum += qlp_coeff[24] * data[i-25]; |
| 905 case 24: sum += qlp_coeff[23] * data[i-24]; |
| 906 case 23: sum += qlp_coeff[22] * data[i-23]; |
| 907 case 22: sum += qlp_coeff[21] * data[i-22]; |
| 908 case 21: sum += qlp_coeff[20] * data[i-21]; |
| 909 case 20: sum += qlp_coeff[19] * data[i-20]; |
| 910 case 19: sum += qlp_coeff[18] * data[i-19]; |
| 911 case 18: sum += qlp_coeff[17] * data[i-18]; |
| 912 case 17: sum += qlp_coeff[16] * data[i-17]; |
| 913 case 16: sum += qlp_coeff[15] * data[i-16]; |
| 914 case 15: sum += qlp_coeff[14] * data[i-15]; |
| 915 case 14: sum += qlp_coeff[13] * data[i-14]; |
| 916 case 13: sum += qlp_coeff[12] * data[i-13]; |
| 917 sum += qlp_coeff[11] * data[i-12]; |
| 918 sum += qlp_coeff[10] * data[i-11]; |
| 919 sum += qlp_coeff[ 9] * data[i-10]; |
| 920 sum += qlp_coeff[ 8] * data[i- 9]; |
| 921 sum += qlp_coeff[ 7] * data[i- 8]; |
| 922 sum += qlp_coeff[ 6] * data[i- 7]; |
| 923 sum += qlp_coeff[ 5] * data[i- 6]; |
| 924 sum += qlp_coeff[ 4] * data[i- 5]; |
| 925 sum += qlp_coeff[ 3] * data[i- 4]; |
| 926 sum += qlp_coeff[ 2] * data[i- 3]; |
| 927 sum += qlp_coeff[ 1] * data[i- 2]; |
| 928 sum += qlp_coeff[ 0] * data[i- 1]; |
| 929 } |
| 930 residual[i] = data[i] - (sum >> lp_quantization); |
| 931 } |
| 932 } |
| 933 } |
| 934 |
| 935 #if defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM /* unused for x64; not better than MMX asm */ |
| 936 |
| 937 FLAC__SSE_TARGET("sse2") |
| 938 void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) |
| 939 { |
| 940 if (order < 8 || order > 12) { |
| 941 FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data); |
| 942 return; |
| 943 } |
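	/* only orders 8..12 are specialized below; anything else was handed off
	 * to the portable FLAC__lpc_restore_signal() above */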
| 944 if (data_len == 0) |
| 945 return; |
| 946 |
| 947 FLAC__ASSERT(order >= 8); |
| 948 FLAC__ASSERT(order <= 12); |
| 949 |
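	/* Restoring is serial (each reconstructed sample feeds the next
	 * prediction), so instead of reloading memory the previous samples are
	 * kept packed as 16-bit values in XMM registers with the newest sample in
	 * the lowest word.  After each output the history is shifted left with
	 * _mm_slli_si128 and the fresh sample is pushed in with _mm_insert_epi16.
	 * For order > 8 the history spans two registers (xmm3:xmm4), and the
	 * 2-at-a-time main loop also keeps a copy of the coefficients pre-shifted
	 * by 16 bits (xmm2:xmm7) to save one history shift per sample; for
	 * order == 8 everything fits in a single register. */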
| 950 if(order > 8) { /* order == 9, 10, 11, 12 */ |
| 951 FLAC__int32 curr; |
| 952 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; |
| 953 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0)); |
| 954 xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4)); |
| 955 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */ |
| 956 switch(order) /* ...and zero them out */ |
| 957 { |
| 958 case 9: |
| 959 xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break; |
| 960 case 10: |
| 961 xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break; |
| 962 case 11: |
| 963 xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break; |
| 964 } |
| 965 xmm2 = _mm_setzero_si128(); |
| 966 xmm0 = _mm_packs_epi32(xmm0, xmm6); |
| 967 xmm1 = _mm_packs_epi32(xmm1, xmm2); |
| 968 |
| 969 xmm4 = _mm_loadu_si128((const __m128i*)(data-12)); |
| 970 xmm5 = _mm_loadu_si128((const __m128i*)(data-8)); |
| 971 xmm3 = _mm_loadu_si128((const __m128i*)(data-4)); |
| 972 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3)); |
| 973 xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3)); |
| 974 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3)); |
| 975 xmm4 = _mm_packs_epi32(xmm4, xmm2); |
| 976 xmm3 = _mm_packs_epi32(xmm3, xmm5); |
| 977 |
| 978 xmm7 = _mm_slli_si128(xmm1, 2); |
| 979 xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14)); |
| 980 xmm2 = _mm_slli_si128(xmm0, 2); |
| 981 |
| 982 /* xmm0, xmm1: qlp_coeff |
| 983 xmm2, xmm7: qlp_coeff << 16 bit |
| 984 xmm3, xmm4: data */ |
| 985 |
| 986 xmm5 = _mm_madd_epi16(xmm4, xmm1); |
| 987 xmm6 = _mm_madd_epi16(xmm3, xmm0); |
| 988 xmm6 = _mm_add_epi32(xmm6, xmm5); |
| 989 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); |
| 990 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); |
| 991 |
| 992 DATA16_RESULT(xmm6); |
| 993 |
| 994 data_len--; |
| 995 |
| 996 if(data_len % 2) { |
| 997 xmm6 = _mm_srli_si128(xmm3, 14); |
| 998 xmm4 = _mm_slli_si128(xmm4, 2); |
| 999 xmm3 = _mm_slli_si128(xmm3, 2); |
| 1000 xmm4 = _mm_or_si128(xmm4, xmm6); |
| 1001 xmm3 = _mm_insert_epi16(xmm3, curr, 0); |
| 1002 |
| 1003 xmm5 = _mm_madd_epi16(xmm4, xmm1); |
| 1004 xmm6 = _mm_madd_epi16(xmm3, xmm0); |
| 1005 xmm6 = _mm_add_epi32(xmm6, xmm5); |
| 1006 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); |
| 1007 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); |
| 1008 |
| 1009 DATA16_RESULT(xmm6); |
| 1010 |
| 1011 data_len--; |
| 1012 } |
| 1013 |
| 1014 while(data_len) { /* data_len is a multiple of 2 */ |
| 1015 /* 1 _mm_slli_si128 per data element less but we need shifted qlp_coeff in xmm2:xmm7 */ |
| 1016 xmm6 = _mm_srli_si128(xmm3, 12); |
| 1017 xmm4 = _mm_slli_si128(xmm4, 4); |
| 1018 xmm3 = _mm_slli_si128(xmm3, 4); |
| 1019 xmm4 = _mm_or_si128(xmm4, xmm6); |
| 1020 xmm3 = _mm_insert_epi16(xmm3, curr, 1); |
| 1021 |
| 1022 xmm5 = _mm_madd_epi16(xmm4, xmm7); |
| 1023 xmm6 = _mm_madd_epi16(xmm3, xmm2); |
| 1024 xmm6 = _mm_add_epi32(xmm6, xmm5); |
| 1025 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); |
| 1026 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); |
| 1027 |
| 1028 DATA16_RESULT(xmm6); |
| 1029 |
| 1030 xmm3 = _mm_insert_epi16(xmm3, curr, 0); |
| 1031 |
| 1032 xmm5 = _mm_madd_epi16(xmm4, xmm1); |
| 1033 xmm6 = _mm_madd_epi16(xmm3, xmm0); |
| 1034 xmm6 = _mm_add_epi32(xmm6, xmm5); |
| 1035 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); |
| 1036 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); |
| 1037 |
| 1038 DATA16_RESULT(xmm6); |
| 1039 |
| 1040 data_len-=2; |
| 1041 } |
| 1042 } /* endif(order > 8) */ |
| 1043 else |
| 1044 { |
| 1045 FLAC__int32 curr; |
| 1046 __m128i xmm0, xmm1, xmm3, xmm6; |
| 1047 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0)); |
| 1048 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4)); |
| 1049 xmm0 = _mm_packs_epi32(xmm0, xmm1); |
| 1050 |
| 1051 xmm1 = _mm_loadu_si128((const __m128i*)(data-8)); |
| 1052 xmm3 = _mm_loadu_si128((const __m128i*)(data-4)); |
| 1053 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3)); |
| 1054 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3)); |
| 1055 xmm3 = _mm_packs_epi32(xmm3, xmm1); |
| 1056 |
| 1057 /* xmm0: qlp_coeff |
| 1058 xmm3: data */ |
| 1059 |
| 1060 xmm6 = _mm_madd_epi16(xmm3, xmm0); |
| 1061 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); |
| 1062 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); |
| 1063 |
| 1064 DATA16_RESULT(xmm6); |
| 1065 |
| 1066 data_len--; |
| 1067 |
| 1068 while(data_len) { |
| 1069 xmm3 = _mm_slli_si128(xmm3, 2); |
| 1070 xmm3 = _mm_insert_epi16(xmm3, curr, 0); |
| 1071 |
| 1072 xmm6 = _mm_madd_epi16(xmm3, xmm0); |
| 1073 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); |
| 1074 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); |
| 1075 |
| 1076 DATA16_RESULT(xmm6); |
| 1077 |
| 1078 data_len--; |
| 1079 } |
| 1080 } |
| 1081 } |
| 1082 |
| 1083 #endif /* defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM */ |
| 1084 |
| 1085 #endif /* FLAC__SSE2_SUPPORTED */ |
| 1086 #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */ |
| 1087 #endif /* FLAC__NO_ASM */ |
| 1088 #endif /* FLAC__INTEGER_ONLY_LIBRARY */ |