OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 84 matching lines...)
95 const __m128i q6 = _mm_sub_epi16(in1, in6); | 95 const __m128i q6 = _mm_sub_epi16(in1, in6); |
96 const __m128i q7 = _mm_sub_epi16(in0, in7); | 96 const __m128i q7 = _mm_sub_epi16(in0, in7); |
97 // Work on first four results | 97 // Work on first four results |
98 { | 98 { |
99 // Add/subtract | 99 // Add/subtract |
100 const __m128i r0 = _mm_add_epi16(q0, q3); | 100 const __m128i r0 = _mm_add_epi16(q0, q3); |
101 const __m128i r1 = _mm_add_epi16(q1, q2); | 101 const __m128i r1 = _mm_add_epi16(q1, q2); |
102 const __m128i r2 = _mm_sub_epi16(q1, q2); | 102 const __m128i r2 = _mm_sub_epi16(q1, q2); |
103 const __m128i r3 = _mm_sub_epi16(q0, q3); | 103 const __m128i r3 = _mm_sub_epi16(q0, q3); |
104 // Interleave to do the multiply by constants which gets us into 32bits | 104 // Interleave to do the multiply by constants which gets us into 32bits |
105 const __m128i t0 = _mm_add_epi16(r0, r1); | 105 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); |
106 const __m128i t1 = _mm_sub_epi16(r0, r1); | 106 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); |
107 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); | 107 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); |
108 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); | 108 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); |
109 | 109 |
110 const __m128i u0 = _mm_mulhrs_epi16(t0, k__dual_p16_p16); | 110 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); |
111 const __m128i u1 = _mm_mulhrs_epi16(t1, k__dual_p16_p16); | 111 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); |
| 112 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); |
| 113 const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); |
| 114 |
112 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); | 115 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); |
113 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); | 116 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); |
114 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); | 117 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); |
115 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); | 118 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); |
116 // dct_const_round_shift | 119 // dct_const_round_shift |
| 120 |
| 121 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); |
| 122 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); |
| 123 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); |
| 124 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); |
| 125 |
117 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); | 126 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); |
118 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); | 127 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); |
119 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); | 128 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); |
120 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); | 129 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); |
| 130 |
| 131 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); |
| 132 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); |
| 133 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); |
| 134 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); |
| 135 |
121 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); | 136 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); |
122 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); | 137 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); |
123 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); | 138 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); |
124 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); | 139 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); |
125 // Combine | 140 // Combine |
126 res0 = u0; | 141 |
127 res4 = u1; | 142 res0 = _mm_packs_epi32(w0, w1); |
| 143 res4 = _mm_packs_epi32(w2, w3); |
128 res2 = _mm_packs_epi32(w4, w5); | 144 res2 = _mm_packs_epi32(w4, w5); |
129 res6 = _mm_packs_epi32(w6, w7); | 145 res6 = _mm_packs_epi32(w6, w7); |
130 } | 146 } |
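
For reference, a minimal scalar sketch of what one lane of the unpack / madd / round / shift / pack sequence above computes. The constant values (cospi_16_64 = 11585, DCT_CONST_BITS = 14) follow libvpx's vp9_idct.h; the helper name below is hypothetical and only illustrates the arithmetic.

#include <stdint.h>

#define DCT_CONST_BITS 14
static const int cospi_16_64 = 11585; /* round(16384 * cos(16 * pi / 64)) */

/* One lane of _mm_madd_epi16 on an interleaved (a, b) pair followed by
 * dct_const_round_shift: multiply-accumulate into 32 bits, add the rounding
 * bias, then shift back down to 16 bits. */
static int16_t fdct_round_shift_pair(int16_t a, int16_t b, int c0, int c1) {
  const int32_t product = a * c0 + b * c1;                       /* madd */
  const int32_t rounded = product + (1 << (DCT_CONST_BITS - 1)); /* rounding bias */
  return (int16_t)(rounded >> DCT_CONST_BITS);                   /* back to 16 bits */
}

With t0/t1 holding the interleaved r0/r1 lanes, each res0 lane corresponds to fdct_round_shift_pair(r0, r1, cospi_16_64, cospi_16_64) and each res4 lane to fdct_round_shift_pair(r0, r1, cospi_16_64, -cospi_16_64).
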
131 // Work on next four results | 147 // Work on next four results |
132 if (pass == 1) { | 148 { |
133 // Interleave to do the multiply by constants which gets us into 32bits | |
134 const __m128i d0 = _mm_unpacklo_epi16(q6, q5); | |
135 const __m128i d1 = _mm_unpackhi_epi16(q6, q5); | |
136 const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); | |
137 const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); | |
138 const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); | |
139 const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); | |
140 // dct_const_round_shift | |
141 const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); | |
142 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); | |
143 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); | |
144 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); | |
145 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); | |
146 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); | |
147 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); | |
148 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); | |
149 // Combine | |
150 const __m128i r0 = _mm_packs_epi32(s0, s1); | |
151 const __m128i r1 = _mm_packs_epi32(s2, s3); | |
152 // Add/subtract | |
153 const __m128i x0 = _mm_add_epi16(q4, r0); | |
154 const __m128i x1 = _mm_sub_epi16(q4, r0); | |
155 const __m128i x2 = _mm_sub_epi16(q7, r1); | |
156 const __m128i x3 = _mm_add_epi16(q7, r1); | |
157 // Interleave to do the multiply by constants which gets us into 32bits | |
158 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); | |
159 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); | |
160 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); | |
161 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); | |
162 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); | |
163 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); | |
164 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); | |
165 const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); | |
166 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); | |
167 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); | |
168 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); | |
169 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); | |
170 // dct_const_round_shift | |
171 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); | |
172 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); | |
173 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); | |
174 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); | |
175 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); | |
176 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); | |
177 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); | |
178 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); | |
179 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); | |
180 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); | |
181 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); | |
182 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); | |
183 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); | |
184 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); | |
185 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); | |
186 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); | |
187 // Combine | |
188 res1 = _mm_packs_epi32(w0, w1); | |
189 res7 = _mm_packs_epi32(w2, w3); | |
190 res5 = _mm_packs_epi32(w4, w5); | |
191 res3 = _mm_packs_epi32(w6, w7); | |
192 } else { | |
193 // Interleave to do the multiply by constants which gets us into 32bits | 149 // Interleave to do the multiply by constants which gets us into 32bits |
194 const __m128i d0 = _mm_sub_epi16(q6, q5); | 150 const __m128i d0 = _mm_sub_epi16(q6, q5); |
195 const __m128i d1 = _mm_add_epi16(q6, q5); | 151 const __m128i d1 = _mm_add_epi16(q6, q5); |
196 const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16); | 152 const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16); |
197 const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16); | 153 const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16); |
| 154 |
198 // Add/subtract | 155 // Add/subtract |
199 const __m128i x0 = _mm_add_epi16(q4, r0); | 156 const __m128i x0 = _mm_add_epi16(q4, r0); |
200 const __m128i x1 = _mm_sub_epi16(q4, r0); | 157 const __m128i x1 = _mm_sub_epi16(q4, r0); |
201 const __m128i x2 = _mm_sub_epi16(q7, r1); | 158 const __m128i x2 = _mm_sub_epi16(q7, r1); |
202 const __m128i x3 = _mm_add_epi16(q7, r1); | 159 const __m128i x3 = _mm_add_epi16(q7, r1); |
203 // Interleave to do the multiply by constants which gets us into 32bits | 160 // Interleave to do the multiply by constants which gets us into 32bits |
204 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); | 161 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); |
205 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); | 162 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); |
206 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); | 163 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); |
207 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); | 164 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); |
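
The second half keeps the SSSE3 shortcut for the multiply by cospi_16_64: _mm_mulhrs_epi16 computes (a * b * 2 + 2^15) >> 16 per 16-bit lane, so if k__dual_p16_p16 holds 2 * cospi_16_64 = 23170 in every lane (an assumption; its definition is outside this hunk), the single instruction matches the madd / round / shift sequence for in-range inputs. A small self-checking sketch:

#include <assert.h>
#include <stdint.h>

/* Scalar model of one _mm_mulhrs_epi16 lane. */
static int16_t mulhrs16(int16_t a, int16_t b) {
  return (int16_t)(((int32_t)a * b * 2 + (1 << 15)) >> 16);
}

/* dct_const_round_shift of a multiply by cospi_16_64 (see the earlier sketch). */
static int16_t round_shift_cospi16(int16_t x) {
  return (int16_t)((x * 11585 + (1 << 13)) >> 14);
}

int main(void) {
  /* (x * 23170 * 2 + 2^15) >> 16 == (x * 11585 + 2^13) >> 14 for these inputs. */
  for (int x = -4096; x <= 4096; ++x)
    assert(mulhrs16((int16_t)x, 23170) == round_shift_cospi16((int16_t)x));
  return 0;
}
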
(...skipping 280 matching lines...)
488 do { | 445 do { |
489 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); | 446 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); |
490 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); | 447 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); |
491 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); | 448 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); |
492 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); | 449 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); |
493 n_coeffs += 8 * 2; | 450 n_coeffs += 8 * 2; |
494 } while (n_coeffs < 0); | 451 } while (n_coeffs < 0); |
495 *eob_ptr = 0; | 452 *eob_ptr = 0; |
496 } | 453 } |
497 } | 454 } |
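
The quantizer tail zeroes the remaining qcoeff/dqcoeff entries with negative indexing: n_coeffs starts at a negative offset and counts up to zero while the pointers stay fixed, so each iteration clears sixteen coefficients (two 128-bit stores) per buffer. A plain-C sketch of the same pattern, assuming 16-bit coefficients and the same pointer/offset convention (the setup is outside this hunk); the function name is hypothetical.

#include <stdint.h>
#include <string.h>

/* Hypothetical scalar equivalent of the SSE2 zeroing loop: ptr + n_coeffs
 * addresses the first coefficient still to be cleared, and n_coeffs rises
 * from a negative value to zero in steps of 16 coefficients. */
static void zero_coeff_tail(int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                            intptr_t n_coeffs) {
  do {
    memset(dqcoeff_ptr + n_coeffs, 0, 16 * sizeof(int16_t));
    memset(qcoeff_ptr + n_coeffs, 0, 16 * sizeof(int16_t));
    n_coeffs += 8 * 2; /* two 8-lane stores per buffer per iteration */
  } while (n_coeffs < 0);
}
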