| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 84 matching lines...) |
| 95 const __m128i q6 = _mm_sub_epi16(in1, in6); | 95 const __m128i q6 = _mm_sub_epi16(in1, in6); |
| 96 const __m128i q7 = _mm_sub_epi16(in0, in7); | 96 const __m128i q7 = _mm_sub_epi16(in0, in7); |
| 97 // Work on first four results | 97 // Work on first four results |
| 98 { | 98 { |
| 99 // Add/subtract | 99 // Add/subtract |
| 100 const __m128i r0 = _mm_add_epi16(q0, q3); | 100 const __m128i r0 = _mm_add_epi16(q0, q3); |
| 101 const __m128i r1 = _mm_add_epi16(q1, q2); | 101 const __m128i r1 = _mm_add_epi16(q1, q2); |
| 102 const __m128i r2 = _mm_sub_epi16(q1, q2); | 102 const __m128i r2 = _mm_sub_epi16(q1, q2); |
| 103 const __m128i r3 = _mm_sub_epi16(q0, q3); | 103 const __m128i r3 = _mm_sub_epi16(q0, q3); |
| 104 // Interleave to do the multiply by constants which gets us into 32bits | 104 // Interleave to do the multiply by constants which gets us into 32bits |
| 105 const __m128i t0 = _mm_add_epi16(r0, r1); | 105 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); |
| 106 const __m128i t1 = _mm_sub_epi16(r0, r1); | 106 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); |
| 107 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); | 107 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); |
| 108 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); | 108 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); |
| 109 | 109 |
| 110 const __m128i u0 = _mm_mulhrs_epi16(t0, k__dual_p16_p16); | 110 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); |
| 111 const __m128i u1 = _mm_mulhrs_epi16(t1, k__dual_p16_p16); | 111 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); |
| 112 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); |
| 113 const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); |
| 114 |
| 112 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); | 115 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); |
| 113 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); | 116 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); |
| 114 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); | 117 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); |
| 115 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); | 118 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); |
| 116 // dct_const_round_shift | 119 // dct_const_round_shift |
| 120 |
| 121 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); |
| 122 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); |
| 123 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); |
| 124 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); |
| 125 |
| 117 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); | 126 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); |
| 118 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); | 127 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); |
| 119 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); | 128 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); |
| 120 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); | 129 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); |
| 130 |
| 131 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); |
| 132 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); |
| 133 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); |
| 134 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); |
| 135 |
| 121 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); | 136 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); |
| 122 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); | 137 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); |
| 123 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); | 138 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); |
| 124 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); | 139 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); |
| 125 // Combine | 140 // Combine |
| 126 res0 = u0; | 141 |
| 127 res4 = u1; | 142 res0 = _mm_packs_epi32(w0, w1); |
| 143 res4 = _mm_packs_epi32(w2, w3); |
| 128 res2 = _mm_packs_epi32(w4, w5); | 144 res2 = _mm_packs_epi32(w4, w5); |
| 129 res6 = _mm_packs_epi32(w6, w7); | 145 res6 = _mm_packs_epi32(w6, w7); |
| 130 } | 146 } |
| 131 // Work on next four results | 147 // Work on next four results |
| 132 if (pass == 1) { | 148 { |
| 133 // Interleave to do the multiply by constants which gets us into 32bits | |
| 134 const __m128i d0 = _mm_unpacklo_epi16(q6, q5); | |
| 135 const __m128i d1 = _mm_unpackhi_epi16(q6, q5); | |
| 136 const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); | |
| 137 const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); | |
| 138 const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); | |
| 139 const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); | |
| 140 // dct_const_round_shift | |
| 141 const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); | |
| 142 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); | |
| 143 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); | |
| 144 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); | |
| 145 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); | |
| 146 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); | |
| 147 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); | |
| 148 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); | |
| 149 // Combine | |
| 150 const __m128i r0 = _mm_packs_epi32(s0, s1); | |
| 151 const __m128i r1 = _mm_packs_epi32(s2, s3); | |
| 152 // Add/subtract | |
| 153 const __m128i x0 = _mm_add_epi16(q4, r0); | |
| 154 const __m128i x1 = _mm_sub_epi16(q4, r0); | |
| 155 const __m128i x2 = _mm_sub_epi16(q7, r1); | |
| 156 const __m128i x3 = _mm_add_epi16(q7, r1); | |
| 157 // Interleave to do the multiply by constants which gets us into 32bits | |
| 158 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); | |
| 159 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); | |
| 160 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); | |
| 161 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); | |
| 162 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); | |
| 163 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); | |
| 164 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); | |
| 165 const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); | |
| 166 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); | |
| 167 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); | |
| 168 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); | |
| 169 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); | |
| 170 // dct_const_round_shift | |
| 171 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); | |
| 172 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); | |
| 173 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); | |
| 174 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); | |
| 175 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); | |
| 176 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); | |
| 177 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); | |
| 178 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); | |
| 179 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); | |
| 180 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); | |
| 181 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); | |
| 182 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); | |
| 183 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); | |
| 184 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); | |
| 185 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); | |
| 186 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); | |
| 187 // Combine | |
| 188 res1 = _mm_packs_epi32(w0, w1); | |
| 189 res7 = _mm_packs_epi32(w2, w3); | |
| 190 res5 = _mm_packs_epi32(w4, w5); | |
| 191 res3 = _mm_packs_epi32(w6, w7); | |
| 192 } else { | |
| 193 // Difference and sum of q6/q5, each scaled with a 16-bit rounding multiply (no interleave or widening to 32 bits here) | 149 // Difference and sum of q6/q5, each scaled with a 16-bit rounding multiply (no interleave or widening to 32 bits here) |
| 194 const __m128i d0 = _mm_sub_epi16(q6, q5); | 150 const __m128i d0 = _mm_sub_epi16(q6, q5); |
| 195 const __m128i d1 = _mm_add_epi16(q6, q5); | 151 const __m128i d1 = _mm_add_epi16(q6, q5); |
| 196 const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16); | 152 const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16); |
| 197 const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16); | 153 const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16); |
| 154 |
| 198 // Add/subtract | 155 // Add/subtract |
| 199 const __m128i x0 = _mm_add_epi16(q4, r0); | 156 const __m128i x0 = _mm_add_epi16(q4, r0); |
| 200 const __m128i x1 = _mm_sub_epi16(q4, r0); | 157 const __m128i x1 = _mm_sub_epi16(q4, r0); |
| 201 const __m128i x2 = _mm_sub_epi16(q7, r1); | 158 const __m128i x2 = _mm_sub_epi16(q7, r1); |
| 202 const __m128i x3 = _mm_add_epi16(q7, r1); | 159 const __m128i x3 = _mm_add_epi16(q7, r1); |
| 203 // Interleave to do the multiply by constants which gets us into 32bits | 160 // Interleave to do the multiply by constants which gets us into 32bits |
| 204 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); | 161 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); |
| 205 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); | 162 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); |
| 206 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); | 163 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); |
| 207 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); | 164 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); |
| (...skipping 280 matching lines...) |
| 488 do { | 445 do { |
| 489 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); | 446 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); |
| 490 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); | 447 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); |
| 491 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); | 448 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); |
| 492 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); | 449 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); |
| 493 n_coeffs += 8 * 2; | 450 n_coeffs += 8 * 2; |
| 494 } while (n_coeffs < 0); | 451 } while (n_coeffs < 0); |
| 495 *eob_ptr = 0; | 452 *eob_ptr = 0; |
| 496 } | 453 } |
| 497 } | 454 } |
| OLD | NEW |