source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c - Issue 341293003: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c

Issue 341293003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« no previous file with comments | « source/libvpx/vp9/common/x86/vp9_postproc_sse2.asm ('k') | source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c » ('j') | no next file with comments »

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
104 for (i = output_height; i > 1; i-=2) {	104 for (i = output_height; i > 1; i-=2) {

105 // load the 2 strides of source	105 // load the 2 strides of source

106 srcReg32b1 = _mm256_castsi128_si256(	106 srcReg32b1 = _mm256_castsi128_si256(

107 _mm_loadu_si128((__m128i *)(src_ptr-3)));	107 _mm_loadu_si128((__m128i *)(src_ptr-3)));

108 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,	108 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,

109 _mm_loadu_si128((__m128i *)	109 _mm_loadu_si128((__m128i *)

110 (src_ptr+src_pixels_per_line-3)), 1);	110 (src_ptr+src_pixels_per_line-3)), 1);

111	111

112 // filter the source buffer	112 // filter the source buffer

113 srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);	113 srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);

114 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);	114 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

115	115

116 // multiply 2 adjacent elements with the filter and add the result	116 // multiply 2 adjacent elements with the filter and add the result

117 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);	117 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

118 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);	118 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

119	119

120 // add and saturate the results together	120 // add and saturate the results together

121 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);	121 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

122	122

123 // filter the source buffer	123 // filter the source buffer

124 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);	124 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);

125 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);	125 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

126	126

127 // multiply 2 adjacent elements with the filter and add the result	127 // multiply 2 adjacent elements with the filter and add the result

128 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);	128 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);

129 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);	129 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

130	130

131 // add and saturate the results together	131 // add and saturate the results together

132 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,	132 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,

133 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));	133 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));

134	134

135 // reading 2 strides of the next 16 bytes	135 // reading 2 strides of the next 16 bytes

136 // (part of it was being read by earlier read)	136 // (part of it was being read by earlier read)

137 srcReg32b2 = _mm256_castsi128_si256(	137 srcReg32b2 = _mm256_castsi128_si256(

138 _mm_loadu_si128((__m128i *)(src_ptr+5)));	138 _mm_loadu_si128((__m128i *)(src_ptr+5)));

139 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,	139 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,

140 _mm_loadu_si128((__m128i *)	140 _mm_loadu_si128((__m128i *)

141 (src_ptr+src_pixels_per_line+5)), 1);	141 (src_ptr+src_pixels_per_line+5)), 1);

142	142

143 // add and saturate the results together	143 // add and saturate the results together

144 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,	144 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,

145 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));	145 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

146	146

147 // filter the source buffer	147 // filter the source buffer

148 srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);	148 srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);

149 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);	149 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);

150	150

151 // multiply 2 adjacent elements with the filter and add the result	151 // multiply 2 adjacent elements with the filter and add the result

152 srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);	152 srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);

153 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);	153 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

154	154

155 // add and saturate the results together	155 // add and saturate the results together

156 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);	156 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

157	157

158 // filter the source buffer	158 // filter the source buffer

159 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg);	159 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg);

160 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);	160 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

161	161

162 // multiply 2 adjacent elements with the filter and add the result	162 // multiply 2 adjacent elements with the filter and add the result

163 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);	163 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);

164 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);	164 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

165	165

166 // add and saturate the results together	166 // add and saturate the results together

167 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,	167 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,

168 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));	168 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));

169 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,	169 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,

170 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));	170 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

171	171

172	172

173 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);	173 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);

(...skipping 27 matching lines...) Expand all Loading...
201 if (i > 0) {	201 if (i > 0) {

202 __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;	202 __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;

203 __m128i srcRegFilt2, srcRegFilt3;	203 __m128i srcRegFilt2, srcRegFilt3;

204	204

205 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));	205 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));

206	206

207 // filter the source buffer	207 // filter the source buffer

208 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,	208 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,

209 _mm256_castsi256_si128(filt1Reg));	209 _mm256_castsi256_si128(filt1Reg));

210 srcRegFilt2 = _mm_shuffle_epi8(srcReg1,	210 srcRegFilt2 = _mm_shuffle_epi8(srcReg1,

211 _mm256_castsi256_si128(filt2Reg));	211 _mm256_castsi256_si128(filt4Reg));

212	212

213 // multiply 2 adjacent elements with the filter and add the result	213 // multiply 2 adjacent elements with the filter and add the result

214 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,	214 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,

215 _mm256_castsi256_si128(firstFilters));	215 _mm256_castsi256_si128(firstFilters));

216 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,	216 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,

217 _mm256_castsi256_si128(secondFilters));	217 _mm256_castsi256_si128(forthFilters));

218	218

219 // add and saturate the results together	219 // add and saturate the results together

220 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);	220 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

221	221

222 // filter the source buffer	222 // filter the source buffer

223 srcRegFilt3= _mm_shuffle_epi8(srcReg1,	223 srcRegFilt3= _mm_shuffle_epi8(srcReg1,

224 _mm256_castsi256_si128(filt4Reg));	224 _mm256_castsi256_si128(filt2Reg));

225 srcRegFilt2= _mm_shuffle_epi8(srcReg1,	225 srcRegFilt2= _mm_shuffle_epi8(srcReg1,

226 _mm256_castsi256_si128(filt3Reg));	226 _mm256_castsi256_si128(filt3Reg));

227	227

228 // multiply 2 adjacent elements with the filter and add the result	228 // multiply 2 adjacent elements with the filter and add the result

229 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,	229 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,

230 _mm256_castsi256_si128(forthFilters));	230 _mm256_castsi256_si128(secondFilters));

231 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,	231 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,

232 _mm256_castsi256_si128(thirdFilters));	232 _mm256_castsi256_si128(thirdFilters));

233	233

234 // add and saturate the results together	234 // add and saturate the results together

235 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,	235 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,

236 _mm_min_epi16(srcRegFilt3, srcRegFilt2));	236 _mm_min_epi16(srcRegFilt3, srcRegFilt2));

237	237

238 // reading the next 16 bytes	238 // reading the next 16 bytes

239 // (part of it was being read by earlier read)	239 // (part of it was being read by earlier read)

240 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));	240 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));

241	241

242 // add and saturate the results together	242 // add and saturate the results together

243 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,	243 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,

244 _mm_max_epi16(srcRegFilt3, srcRegFilt2));	244 _mm_max_epi16(srcRegFilt3, srcRegFilt2));

245	245

246 // filter the source buffer	246 // filter the source buffer

247 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,	247 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,

248 _mm256_castsi256_si128(filt1Reg));	248 _mm256_castsi256_si128(filt1Reg));

249 srcRegFilt2 = _mm_shuffle_epi8(srcReg2,	249 srcRegFilt2 = _mm_shuffle_epi8(srcReg2,

250 _mm256_castsi256_si128(filt2Reg));	250 _mm256_castsi256_si128(filt4Reg));

251	251

252 // multiply 2 adjacent elements with the filter and add the result	252 // multiply 2 adjacent elements with the filter and add the result

253 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,	253 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,

254 _mm256_castsi256_si128(firstFilters));	254 _mm256_castsi256_si128(firstFilters));

255 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,	255 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,

256 _mm256_castsi256_si128(secondFilters));	256 _mm256_castsi256_si128(forthFilters));

257	257

258 // add and saturate the results together	258 // add and saturate the results together

259 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);	259 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

260	260

261 // filter the source buffer	261 // filter the source buffer

262 srcRegFilt3 = _mm_shuffle_epi8(srcReg2,	262 srcRegFilt3 = _mm_shuffle_epi8(srcReg2,

263 _mm256_castsi256_si128(filt4Reg));	263 _mm256_castsi256_si128(filt2Reg));

264 srcRegFilt2 = _mm_shuffle_epi8(srcReg2,	264 srcRegFilt2 = _mm_shuffle_epi8(srcReg2,

265 _mm256_castsi256_si128(filt3Reg));	265 _mm256_castsi256_si128(filt3Reg));

266	266

267 // multiply 2 adjacent elements with the filter and add the result	267 // multiply 2 adjacent elements with the filter and add the result

268 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,	268 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,

269 _mm256_castsi256_si128(forthFilters));	269 _mm256_castsi256_si128(secondFilters));

270 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,	270 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,

271 _mm256_castsi256_si128(thirdFilters));	271 _mm256_castsi256_si128(thirdFilters));

272	272

273 // add and saturate the results together	273 // add and saturate the results together

274 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,	274 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,

275 _mm_min_epi16(srcRegFilt3, srcRegFilt2));	275 _mm_min_epi16(srcRegFilt3, srcRegFilt2));

276 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,	276 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,

277 _mm_max_epi16(srcRegFilt3, srcRegFilt2));	277 _mm_max_epi16(srcRegFilt3, srcRegFilt2));

278	278

279	279

(...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
535	535

536 // shrink to 8 bit each 16 bits, the first lane contain the first	536 // shrink to 8 bit each 16 bits, the first lane contain the first

537 // convolve result and the second lane contain the second convolve	537 // convolve result and the second lane contain the second convolve

538 // result	538 // result

539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);	539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

540	540

541 // save 16 bytes	541 // save 16 bytes

542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);	542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);

543 }	543 }

544 }	544 }

OLD	NEW