source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c - Issue 341293003: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c

Issue 341293003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 26 matching lines...) Expand all Loading...
37 DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {	37 DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {

38 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14	38 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14

39 };	39 };

40	40

41 void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,	41 void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,

42 unsigned int src_pixels_per_line,	42 unsigned int src_pixels_per_line,

43 unsigned char *output_ptr,	43 unsigned char *output_ptr,

44 unsigned int output_pitch,	44 unsigned int output_pitch,

45 unsigned int output_height,	45 unsigned int output_height,

46 int16_t *filter) {	46 int16_t *filter) {

47 __m128i firstFilters, secondFilters, thirdFilters, forthFilters;	47 __m128i firstFilters, secondFilters, shuffle1, shuffle2;

48 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;	48 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;

49 __m128i addFilterReg64, filtersReg, srcReg, minReg;	49 __m128i addFilterReg64, filtersReg, srcReg, minReg;

50 unsigned int i;	50 unsigned int i;

51	51

52 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64	52 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64

53 addFilterReg64 =_mm_set1_epi32((int)0x0400040u);	53 addFilterReg64 =_mm_set1_epi32((int)0x0400040u);

54 filtersReg = _mm_loadu_si128((__m128i *)filter);	54 filtersReg = _mm_loadu_si128((__m128i *)filter);

55 // converting the 16 bit (short) to 8 bit (byte) and have the same data	55 // converting the 16 bit (short) to 8 bit (byte) and have the same data

56 // in both lanes of 128 bit register.	56 // in both lanes of 128 bit register.

57 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);	57 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);

58	58

59 // duplicate only the first 16 bits in the filter into the first lane	59 // duplicate only the first 16 bits in the filter into the first lane

60 firstFilters = _mm_shufflelo_epi16(filtersReg, 0);	60 firstFilters = _mm_shufflelo_epi16(filtersReg, 0);

61 // duplicate only the third 16 bits in the filter into the first lane	61 // duplicate only the third 16 bits in the filter into the first lane

62 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);	62 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);

63 // duplicate only the second 16 bits in the filter into the second lane	63 // duplicate only the second 16 bits in the filter into the second lane

	64 // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3

64 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);	65 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);

65 // duplicate only the fourth 16 bits in the filter into the second lane	66 // duplicate only the fourth 16 bits in the filter into the second lane

	67 // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7

66 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);	68 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);

67	69

68 // loading the local filters	70 // loading the local filters

69 thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8);	71 shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8);

70 forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8);	72 shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);

71	73

72 for (i = 0; i < output_height; i++) {	74 for (i = 0; i < output_height; i++) {

73 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));	75 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));

74	76

75 // filter the source buffer	77 // filter the source buffer

76 srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters);	78 srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);

77 srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters);	79 srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2);

78	80

79 // multiply 2 adjacent elements with the filter and add the result	81 // multiply 2 adjacent elements with the filter and add the result

80 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);	82 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);

81 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);	83 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

82	84

83 // extract the higher half of the lane	85 // extract the higher half of the lane

84 srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);	86 srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);

85 srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);	87 srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);

86	88

87 minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);	89 minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);

(...skipping 69 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
157	159

158 // filter the source buffer	160 // filter the source buffer

159 srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);	161 srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);

160 srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);	162 srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);

161	163

162 // multiply 2 adjacent elements with the filter and add the result	164 // multiply 2 adjacent elements with the filter and add the result

163 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);	165 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);

164 srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);	166 srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);

165	167

166 // add and saturate all the results together	168 // add and saturate all the results together

167 minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3);	169 minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);

	170 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

	171

	172 srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3);

	173 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);

168 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);	174 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);

169

170 srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3);

171 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);

172 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

173 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);	175 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

174	176

175 // arithmetic right shift of each 16-bit value by 7 bits	177 // arithmetic right shift of each 16-bit value by 7 bits

176 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);	178 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

177	179

178 // pack each 16-bit value down to 8 bits (unsigned saturation)	180 // pack each 16-bit value down to 8 bits (unsigned saturation)

179 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);	181 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

180	182

181 src_ptr+=src_pixels_per_line;	183 src_ptr+=src_pixels_per_line;

182	184

(...skipping 39 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
222 filt1Reg = _mm_load_si128((__m128i const *)filt1_global);	224 filt1Reg = _mm_load_si128((__m128i const *)filt1_global);

223 filt2Reg = _mm_load_si128((__m128i const *)filt2_global);	225 filt2Reg = _mm_load_si128((__m128i const *)filt2_global);

224 filt3Reg = _mm_load_si128((__m128i const *)filt3_global);	226 filt3Reg = _mm_load_si128((__m128i const *)filt3_global);

225 filt4Reg = _mm_load_si128((__m128i const *)filt4_global);	227 filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

226	228

227 for (i = 0; i < output_height; i++) {	229 for (i = 0; i < output_height; i++) {

228 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));	230 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));

229	231

230 // filter the source buffer	232 // filter the source buffer

231 srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);	233 srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);

232 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg);	234 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg);

233	235

234 // multiply 2 adjacent elements with the filter and add the result	236 // multiply 2 adjacent elements with the filter and add the result

235 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);	237 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);

236 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);	238 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);

237	239

238 // add and saturate the results together	240 // add and saturate the results together

239 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);	241 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

240	242

241 // filter the source buffer	243 // filter the source buffer

242 srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg);	244 srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg);

243 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);	245 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);

244	246

245 // multiply 2 adjacent elements with the filter and add the result	247 // multiply 2 adjacent elements with the filter and add the result

246 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);	248 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);

247 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);	249 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);

248	250

249 // add and saturate the results together	251 // add and saturate the results together

250 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,	252 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,

251 _mm_min_epi16(srcRegFilt3, srcRegFilt2));	253 _mm_min_epi16(srcRegFilt3, srcRegFilt2));

252	254

253 // reading the next 16 bytes.	255 // reading the next 16 bytes.

254 // (part of it was being read by earlier read)	256 // (part of it was being read by earlier read)

255 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));	257 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));

256	258

257 // add and saturate the results together	259 // add and saturate the results together

258 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,	260 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,

259 _mm_max_epi16(srcRegFilt3, srcRegFilt2));	261 _mm_max_epi16(srcRegFilt3, srcRegFilt2));

260	262

261 // filter the source buffer	263 // filter the source buffer

262 srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);	264 srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);

263 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg);	265 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg);

264	266

265 // multiply 2 adjacent elements with the filter and add the result	267 // multiply 2 adjacent elements with the filter and add the result

266 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);	268 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);

267 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);	269 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);

268	270

269 // add and saturate the results together	271 // add and saturate the results together

270 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);	272 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

271	273

272 // filter the source buffer	274 // filter the source buffer

273 srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg);	275 srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg);

274 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);	276 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);

275	277

276 // multiply 2 adjacent elements with the filter and add the result	278 // multiply 2 adjacent elements with the filter and add the result

277 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);	279 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);

278 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);	280 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);

279	281

280 // add and saturate the results together	282 // add and saturate the results together

281 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,	283 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,

282 _mm_min_epi16(srcRegFilt3, srcRegFilt2));	284 _mm_min_epi16(srcRegFilt3, srcRegFilt2));

283 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,	285 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,

284 _mm_max_epi16(srcRegFilt3, srcRegFilt2));	286 _mm_max_epi16(srcRegFilt3, srcRegFilt2));

285	287

286 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);	288 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);

287 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);	289 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);

(...skipping 193 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
481 srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);	483 srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);

482	484

483 src_ptr+=src_pitch;	485 src_ptr+=src_pitch;

484	486

485 // save 16 bytes convolve result	487 // save 16 bytes convolve result

486 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);	488 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);

487	489

488 output_ptr+=out_pitch;	490 output_ptr+=out_pitch;

489 }	491 }

490 }	492 }

OLD	NEW