Chromium Code Reviews

Unified diff: source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c

Issue 1015483002: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 9 months ago
 /*
  * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  *
  * Use of this source code is governed by a BSD-style license
  * that can be found in the LICENSE file in the root of the source
  * tree. An additional intellectual property rights grant can be found
  * in the file PATENTS. All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
  */

(... 84 matching lines skipped ...)

   const __m128i q6 = _mm_sub_epi16(in1, in6);
   const __m128i q7 = _mm_sub_epi16(in0, in7);
   // Work on first four results
   {
     // Add/subtract
     const __m128i r0 = _mm_add_epi16(q0, q3);
     const __m128i r1 = _mm_add_epi16(q1, q2);
     const __m128i r2 = _mm_sub_epi16(q1, q2);
     const __m128i r3 = _mm_sub_epi16(q0, q3);
     // Interleave to do the multiply by constants which gets us into 32bits
-    const __m128i t0 = _mm_add_epi16(r0, r1);
-    const __m128i t1 = _mm_sub_epi16(r0, r1);
+    const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+    const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
     const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
     const __m128i t3 = _mm_unpackhi_epi16(r2, r3);

-    const __m128i u0 = _mm_mulhrs_epi16(t0, k__dual_p16_p16);
-    const __m128i u1 = _mm_mulhrs_epi16(t1, k__dual_p16_p16);
+    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+    const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+
     const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
     const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
     const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
     const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
     // dct_const_round_shift
+
+    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+
     const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
     const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
     const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
     const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+
     const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
     const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
     const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
     const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
     // Combine
-    res0 = u0;
-    res4 = u1;
+
+    res0 = _mm_packs_epi32(w0, w1);
+    res4 = _mm_packs_epi32(w2, w3);
     res2 = _mm_packs_epi32(w4, w5);
     res6 = _mm_packs_epi32(w6, w7);
   }
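
Note on the hunk above: the new right-hand side drops the _mm_mulhrs_epi16 shortcut for res0/res4 and computes them with the same _mm_madd_epi16 / round / shift / _mm_packs_epi32 sequence used for the other outputs. Below is a minimal scalar sketch of what one output lane of that sequence computes; it is illustration only, butterfly_lane is a hypothetical name, and it assumes the usual libvpx conventions (cospi constants scaled by 2^14, DCT_CONST_BITS == 14).

  #include <stdint.h>

  #define DCT_CONST_BITS 14  /* assumed, per the usual libvpx definition */

  /* One lane of: _mm_madd_epi16 on an interleaved (a, b) pair, followed by
   * the dct_const_round_shift pattern (add rounding constant, arithmetic
   * shift right) and _mm_packs_epi32 (saturate back to 16 bits). */
  static int16_t butterfly_lane(int16_t a, int16_t b, int32_t c0, int32_t c1) {
    int32_t sum = a * c0 + b * c1;                          /* madd: pairwise multiply-add */
    int32_t r = (sum + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
    if (r > INT16_MAX) r = INT16_MAX;                       /* packs: signed saturation */
    if (r < INT16_MIN) r = INT16_MIN;
    return (int16_t)r;
  }

Assuming k__cospi_p16_p16 interleaves (cospi_16_64, cospi_16_64) as its name suggests, each lane of res0 above corresponds to butterfly_lane(r0[i], r1[i], cospi_16_64, cospi_16_64), and res4 uses the (cospi_16_64, -cospi_16_64) pair.
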
   // Work on next four results
-  if (pass == 1) {
-    // Interleave to do the multiply by constants which gets us into 32bits
-    const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
-    const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-    const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
-    const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
-    const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
-    const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
-    // dct_const_round_shift
-    const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
-    const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
-    const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
-    const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
-    const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
-    const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
-    const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
-    const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
-    // Combine
-    const __m128i r0 = _mm_packs_epi32(s0, s1);
-    const __m128i r1 = _mm_packs_epi32(s2, s3);
-    // Add/subtract
-    const __m128i x0 = _mm_add_epi16(q4, r0);
-    const __m128i x1 = _mm_sub_epi16(q4, r0);
-    const __m128i x2 = _mm_sub_epi16(q7, r1);
-    const __m128i x3 = _mm_add_epi16(q7, r1);
-    // Interleave to do the multiply by constants which gets us into 32bits
-    const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-    const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-    const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-    const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
-    const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
-    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
-    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
-    const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
-    const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
-    const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
-    const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
-    // dct_const_round_shift
-    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-    const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-    const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-    const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-    const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-    const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-    const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-    const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-    const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-    // Combine
-    res1 = _mm_packs_epi32(w0, w1);
-    res7 = _mm_packs_epi32(w2, w3);
-    res5 = _mm_packs_epi32(w4, w5);
-    res3 = _mm_packs_epi32(w6, w7);
-  } else {
+  {
     // Interleave to do the multiply by constants which gets us into 32bits
     const __m128i d0 = _mm_sub_epi16(q6, q5);
     const __m128i d1 = _mm_add_epi16(q6, q5);
     const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16);
     const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16);
+
     // Add/subtract
     const __m128i x0 = _mm_add_epi16(q4, r0);
     const __m128i x1 = _mm_sub_epi16(q4, r0);
     const __m128i x2 = _mm_sub_epi16(q7, r1);
     const __m128i x3 = _mm_add_epi16(q7, r1);
     // Interleave to do the multiply by constants which gets us into 32bits
     const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
     const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
     const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
     const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
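
Note on the hunk above: the pass == 1 special case is removed, but the _mm_mulhrs_epi16 path for r0/r1 is kept. If k__dual_p16_p16 holds 2 * cospi_16_64 (== 23170, i.e. cos(pi/4) in Q15) in every 16-bit lane, which is what the name suggests but is an assumption here, then PMULHRSW reproduces dct_const_round_shift(x * cospi_16_64) without the 32-bit detour, provided the preceding 16-bit add/subtract (q6 +/- q5) does not overflow. A scalar sketch of one lane (mulhrs_lane is a hypothetical name):

  #include <stdint.h>

  /* One 16-bit lane of _mm_mulhrs_epi16(x, k): (((x * k) >> 14) + 1) >> 1,
   * which equals (x * k + (1 << 14)) >> 15.  With k == 23170 (assumed value
   * of k__dual_p16_p16) this rounds x * cospi_16_64 at bit 14, matching the
   * madd + k__DCT_CONST_ROUNDING + srai sequence used elsewhere. */
  static int16_t mulhrs_lane(int16_t x, int16_t k) {
    int32_t p = (int32_t)x * k;          /* 32-bit intermediate product */
    return (int16_t)((p + (1 << 14)) >> 15);
  }
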
(... 280 matching lines skipped ...)

     do {
       _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
       _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
       _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
       _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
       n_coeffs += 8 * 2;
     } while (n_coeffs < 0);
     *eob_ptr = 0;
   }
 }
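
Note on the unchanged tail above: n_coeffs starts at a negative offset and counts up toward zero, so qcoeff_ptr/dqcoeff_ptr are effectively biased to the end of the block and the loop condition reduces to a sign test. A standalone sketch of the same idiom (zero_block and its parameters are hypothetical; the real function operates on pre-biased pointers supplied by the caller):

  #include <stdint.h>
  #include <string.h>

  /* Zero `count` 16-bit coefficients in both arrays, 16 at a time, walking a
   * negative index up to zero like the do/while loop above. */
  static void zero_block(int16_t *qcoeff, int16_t *dqcoeff, intptr_t count) {
    int16_t *qend = qcoeff + count;    /* bias pointers to one-past-the-end */
    int16_t *dqend = dqcoeff + count;
    intptr_t n = -count;               /* assumes count is a multiple of 16 */
    do {
      memset(qend + n, 0, 16 * sizeof(int16_t));   /* 8 * 2 coefficients per */
      memset(dqend + n, 0, 16 * sizeof(int16_t));  /* array, per iteration   */
      n += 8 * 2;
    } while (n < 0);
  }

The SSE2 version above does the same work with four aligned 128-bit stores per iteration instead of the two memset calls.
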
