OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 84 matching lines...)
95 const __m128i q6 = _mm_sub_epi16(in1, in6); | 95 const __m128i q6 = _mm_sub_epi16(in1, in6); |
96 const __m128i q7 = _mm_sub_epi16(in0, in7); | 96 const __m128i q7 = _mm_sub_epi16(in0, in7); |
97 // Work on first four results | 97 // Work on first four results |
98 { | 98 { |
99 // Add/subtract | 99 // Add/subtract |
100 const __m128i r0 = _mm_add_epi16(q0, q3); | 100 const __m128i r0 = _mm_add_epi16(q0, q3); |
101 const __m128i r1 = _mm_add_epi16(q1, q2); | 101 const __m128i r1 = _mm_add_epi16(q1, q2); |
102 const __m128i r2 = _mm_sub_epi16(q1, q2); | 102 const __m128i r2 = _mm_sub_epi16(q1, q2); |
103 const __m128i r3 = _mm_sub_epi16(q0, q3); | 103 const __m128i r3 = _mm_sub_epi16(q0, q3); |
104 // Interleave to do the multiply by constants which gets us into 32bits | 104 // Interleave to do the multiply by constants which gets us into 32bits |
105 const __m128i t0 = _mm_add_epi16(r0, r1); | 105 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); |
106 const __m128i t1 = _mm_sub_epi16(r0, r1); | 106 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); |
107 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); | 107 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); |
108 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); | 108 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); |
109 | 109 |
110 const __m128i u0 = _mm_mulhrs_epi16(t0, k__dual_p16_p16); | 110 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); |
111 const __m128i u1 = _mm_mulhrs_epi16(t1, k__dual_p16_p16); | 111 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); |
| 112 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); |
| 113 const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); |
| 114 |
112 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); | 115 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); |
113 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); | 116 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); |
114 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); | 117 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); |
115 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); | 118 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); |
116 // dct_const_round_shift | 119 // dct_const_round_shift |
| 120 |
| 121 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); |
| 122 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); |
| 123 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); |
| 124 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); |
| 125 |
117 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); | 126 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); |
118 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); | 127 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); |
119 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); | 128 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); |
120 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); | 129 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); |
| 130 |
| 131 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); |
| 132 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); |
| 133 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); |
| 134 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); |
| 135 |
121 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); | 136 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); |
122 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); | 137 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); |
123 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); | 138 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); |
124 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); | 139 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); |
125 // Combine | 140 // Combine |
126 res0 = u0; | 141 |
127 res4 = u1; | 142 res0 = _mm_packs_epi32(w0, w1); |
| 143 res4 = _mm_packs_epi32(w2, w3); |
128 res2 = _mm_packs_epi32(w4, w5); | 144 res2 = _mm_packs_epi32(w4, w5); |
129 res6 = _mm_packs_epi32(w6, w7); | 145 res6 = _mm_packs_epi32(w6, w7); |
130 } | 146 } |
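
For reference, a minimal scalar sketch of what one lane of the unpack / madd / round / shift / pack sequence above computes. The constant values (cospi_16_64 = 11585, DCT_CONST_BITS = 14) follow libvpx's vp9_idct.h; the helper name below is hypothetical and only illustrates the arithmetic.

#include <stdint.h>

#define DCT_CONST_BITS 14
static const int cospi_16_64 = 11585; /* round(16384 * cos(16 * pi / 64)) */

/* One lane of _mm_madd_epi16 on an interleaved (a, b) pair followed by
 * dct_const_round_shift: multiply-accumulate into 32 bits, add the rounding
 * bias, then shift back down to 16 bits. */
static int16_t fdct_round_shift_pair(int16_t a, int16_t b, int c0, int c1) {
  const int32_t product = a * c0 + b * c1;                       /* madd */
  const int32_t rounded = product + (1 << (DCT_CONST_BITS - 1)); /* rounding bias */
  return (int16_t)(rounded >> DCT_CONST_BITS);                   /* back to 16 bits */
}

With t0/t1 holding the interleaved r0/r1 lanes, each res0 lane corresponds to fdct_round_shift_pair(r0, r1, cospi_16_64, cospi_16_64) and each res4 lane to fdct_round_shift_pair(r0, r1, cospi_16_64, -cospi_16_64).
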
131 // Work on next four results | 147 // Work on next four results |
132 if (pass == 1) { | 148 { |
133 // Interleave to do the multiply by constants which gets us into 32bits | |
134 const __m128i d0 = _mm_unpacklo_epi16(q6, q5); | |
135 const __m128i d1 = _mm_unpackhi_epi16(q6, q5); | |
136 const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); | |
137 const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); | |
138 const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); | |
139 const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); | |
140 // dct_const_round_shift | |
141 const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); | |
142 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); | |
143 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); | |
144 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); | |
145 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); | |
146 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); | |
147 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); | |
148 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); | |
149 // Combine | |
150 const __m128i r0 = _mm_packs_epi32(s0, s1); | |
151 const __m128i r1 = _mm_packs_epi32(s2, s3); | |
152 // Add/subtract | |
153 const __m128i x0 = _mm_add_epi16(q4, r0); | |
154 const __m128i x1 = _mm_sub_epi16(q4, r0); | |
155 const __m128i x2 = _mm_sub_epi16(q7, r1); | |
156 const __m128i x3 = _mm_add_epi16(q7, r1); | |
157 // Interleave to do the multiply by constants which gets us into 32bits | |
158 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); | |
159 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); | |
160 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); | |
161 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); | |
162 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); | |
163 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); | |
164 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); | |
165 const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); | |
166 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); | |
167 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); | |
168 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); | |
169 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); | |
170 // dct_const_round_shift | |
171 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); | |
172 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); | |
173 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); | |
174 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); | |
175 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); | |
176 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); | |
177 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); | |
178 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); | |
179 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); | |
180 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); | |
181 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); | |
182 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); | |
183 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); | |
184 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); | |
185 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); | |
186 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); | |
187 // Combine | |
188 res1 = _mm_packs_epi32(w0, w1); | |
189 res7 = _mm_packs_epi32(w2, w3); | |
190 res5 = _mm_packs_epi32(w4, w5); | |
191 res3 = _mm_packs_epi32(w6, w7); | |
192 } else { | |
193 // Interleave to do the multiply by constants which gets us into 32bits | 149 // Interleave to do the multiply by constants which gets us into 32bits |
194 const __m128i d0 = _mm_sub_epi16(q6, q5); | 150 const __m128i d0 = _mm_sub_epi16(q6, q5); |
195 const __m128i d1 = _mm_add_epi16(q6, q5); | 151 const __m128i d1 = _mm_add_epi16(q6, q5); |
196 const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16); | 152 const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16); |
197 const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16); | 153 const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16); |
| 154 |
198 // Add/subtract | 155 // Add/subtract |
199 const __m128i x0 = _mm_add_epi16(q4, r0); | 156 const __m128i x0 = _mm_add_epi16(q4, r0); |
200 const __m128i x1 = _mm_sub_epi16(q4, r0); | 157 const __m128i x1 = _mm_sub_epi16(q4, r0); |
201 const __m128i x2 = _mm_sub_epi16(q7, r1); | 158 const __m128i x2 = _mm_sub_epi16(q7, r1); |
202 const __m128i x3 = _mm_add_epi16(q7, r1); | 159 const __m128i x3 = _mm_add_epi16(q7, r1); |
203 // Interleave to do the multiply by constants which gets us into 32bits | 160 // Interleave to do the multiply by constants which gets us into 32bits |
204 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); | 161 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); |
205 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); | 162 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); |
206 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); | 163 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); |
207 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); | 164 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); |
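
The second half keeps the SSSE3 shortcut for the multiply by cospi_16_64: _mm_mulhrs_epi16 computes (a * b * 2 + 2^15) >> 16 per 16-bit lane, so if k__dual_p16_p16 holds 2 * cospi_16_64 = 23170 in every lane (an assumption; its definition is outside this hunk), the single instruction matches the madd / round / shift sequence for in-range inputs. A small self-checking sketch:

#include <assert.h>
#include <stdint.h>

/* Scalar model of one _mm_mulhrs_epi16 lane. */
static int16_t mulhrs16(int16_t a, int16_t b) {
  return (int16_t)(((int32_t)a * b * 2 + (1 << 15)) >> 16);
}

/* dct_const_round_shift of a multiply by cospi_16_64 (see the earlier sketch). */
static int16_t round_shift_cospi16(int16_t x) {
  return (int16_t)((x * 11585 + (1 << 13)) >> 14);
}

int main(void) {
  /* (x * 23170 * 2 + 2^15) >> 16 == (x * 11585 + 2^13) >> 14 for these inputs. */
  for (int x = -4096; x <= 4096; ++x)
    assert(mulhrs16((int16_t)x, 23170) == round_shift_cospi16((int16_t)x));
  return 0;
}
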
(...skipping 280 matching lines...)
488 do { | 445 do { |
489 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); | 446 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); |
490 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); | 447 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); |
491 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); | 448 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); |
492 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); | 449 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); |
493 n_coeffs += 8 * 2; | 450 n_coeffs += 8 * 2; |
494 } while (n_coeffs < 0); | 451 } while (n_coeffs < 0); |
495 *eob_ptr = 0; | 452 *eob_ptr = 0; |
496 } | 453 } |
497 } | 454 } |
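
The quantizer tail zeroes the remaining qcoeff/dqcoeff entries with negative indexing: n_coeffs starts at a negative offset and counts up to zero while the pointers stay fixed, so each iteration clears sixteen coefficients (two 128-bit stores) per buffer. A plain-C sketch of the same pattern, assuming 16-bit coefficients and the same pointer/offset convention (the setup is outside this hunk); the function name is hypothetical.

#include <stdint.h>
#include <string.h>

/* Hypothetical scalar equivalent of the SSE2 zeroing loop: ptr + n_coeffs
 * addresses the first coefficient still to be cleared, and n_coeffs rises
 * from a negative value to zero in steps of 16 coefficients. */
static void zero_coeff_tail(int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                            intptr_t n_coeffs) {
  do {
    memset(dqcoeff_ptr + n_coeffs, 0, 16 * sizeof(int16_t));
    memset(qcoeff_ptr + n_coeffs, 0, 16 * sizeof(int16_t));
    n_coeffs += 8 * 2; /* two 8-lane stores per buffer per iteration */
  } while (n_coeffs < 0);
}
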