source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c - Issue 1162573005: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c

Issue 1162573005: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master

Patch Set: Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

	11 // Due to a header conflict between math.h and intrinsics includes with ceil()

	12 // in certain configurations under vs9 this include needs to precede

	13 // immintrin.h.

	14 #include "./vp9_rtcd.h"

	15

11 #include <immintrin.h>	16 #include <immintrin.h>

	17

	18 #include "vp9/common/x86/convolve.h"

12 #include "vpx_ports/mem.h"	19 #include "vpx_ports/mem.h"

13	20

14 // filters for 16_h8 and 16_v8	21 // filters for 16_h8 and 16_v8

15 DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {	22 DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {

16 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,	23 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,

17 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8	24 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8

18 };	25 };

19	26

20 DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {	27 DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {

21 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,	28 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,

(...skipping 24 matching lines...) Expand all Loading...
46 _mm_broadcastsi128_si256((__m128i const *)&(x))	53 _mm_broadcastsi128_si256((__m128i const *)&(x))

47 # elif __GNUC__ == 4 && __GNUC_MINOR__ == 7	54 # elif __GNUC__ == 4 && __GNUC_MINOR__ == 7

48 # define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)	55 # define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)

49 # else // gcc > 4.7	56 # else // gcc > 4.7

50 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)	57 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)

51 # endif // gcc <= 4.6	58 # endif // gcc <= 4.6

52 #else // !(gcc \|\| clang)	59 #else // !(gcc \|\| clang)

53 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)	60 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)

54 #endif // __clang__	61 #endif // __clang__

55	62

56 void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,	63 static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr,

57 unsigned int src_pixels_per_line,	64 ptrdiff_t src_pixels_per_line,

58 unsigned char *output_ptr,	65 uint8_t *output_ptr,

59 unsigned int output_pitch,	66 ptrdiff_t output_pitch,

60 unsigned int output_height,	67 uint32_t output_height,

61 int16_t *filter) {	68 const int16_t *filter) {

62 __m128i filtersReg;	69 __m128i filtersReg;

63 __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;	70 __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;

64 __m256i firstFilters, secondFilters, thirdFilters, forthFilters;	71 __m256i firstFilters, secondFilters, thirdFilters, forthFilters;

65 __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;	72 __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;

66 __m256i srcReg32b1, srcReg32b2, filtersReg32;	73 __m256i srcReg32b1, srcReg32b2, filtersReg32;

67 unsigned int i;	74 unsigned int i;

68 unsigned int src_stride, dst_stride;	75 ptrdiff_t src_stride, dst_stride;

69	76

70 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64	77 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64

71 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);	78 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);

72 filtersReg = _mm_loadu_si128((__m128i *)filter);	79 filtersReg = _mm_loadu_si128((const __m128i *)filter);

73 // converting the 16 bit (short) to 8 bit (byte) and have the same data	80 // converting the 16 bit (short) to 8 bit (byte) and have the same data

74 // in both lanes of 128 bit register.	81 // in both lanes of 128 bit register.

75 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);	82 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);

76 // have the same data in both lanes of a 256 bit register	83 // have the same data in both lanes of a 256 bit register

77 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);	84 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

78	85

79 // duplicate only the first 16 bits (first and second byte)	86 // duplicate only the first 16 bits (first and second byte)

80 // across 256 bit register	87 // across 256 bit register

81 firstFilters = _mm256_shuffle_epi8(filtersReg32,	88 firstFilters = _mm256_shuffle_epi8(filtersReg32,

82 _mm256_set1_epi16(0x100u));	89 _mm256_set1_epi16(0x100u));

(...skipping 14 matching lines...) Expand all Loading...
97 filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);	104 filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);

98 filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);	105 filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);

99 filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);	106 filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);

100	107

101 // multiple the size of the source and destination stride by two	108 // multiple the size of the source and destination stride by two

102 src_stride = src_pixels_per_line << 1;	109 src_stride = src_pixels_per_line << 1;

103 dst_stride = output_pitch << 1;	110 dst_stride = output_pitch << 1;

104 for (i = output_height; i > 1; i-=2) {	111 for (i = output_height; i > 1; i-=2) {

105 // load the 2 strides of source	112 // load the 2 strides of source

106 srcReg32b1 = _mm256_castsi128_si256(	113 srcReg32b1 = _mm256_castsi128_si256(

107 _mm_loadu_si128((__m128i *)(src_ptr-3)));	114 _mm_loadu_si128((const __m128i *)(src_ptr - 3)));

108 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,	115 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,

109 _mm_loadu_si128((__m128i *)	116 _mm_loadu_si128((const __m128i *)

110 (src_ptr+src_pixels_per_line-3)), 1);	117 (src_ptr+src_pixels_per_line-3)), 1);

111	118

112 // filter the source buffer	119 // filter the source buffer

113 srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);	120 srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);

114 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);	121 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

115	122

116 // multiply 2 adjacent elements with the filter and add the result	123 // multiply 2 adjacent elements with the filter and add the result

117 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);	124 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

118 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);	125 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

119	126

120 // add and saturate the results together	127 // add and saturate the results together

121 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);	128 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

122	129

123 // filter the source buffer	130 // filter the source buffer

124 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);	131 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);

125 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);	132 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

126	133

127 // multiply 2 adjacent elements with the filter and add the result	134 // multiply 2 adjacent elements with the filter and add the result

128 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);	135 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);

129 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);	136 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

130	137

131 // add and saturate the results together	138 // add and saturate the results together

132 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,	139 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,

133 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));	140 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));

134	141

135 // reading 2 strides of the next 16 bytes	142 // reading 2 strides of the next 16 bytes

136 // (part of it was being read by earlier read)	143 // (part of it was being read by earlier read)

137 srcReg32b2 = _mm256_castsi128_si256(	144 srcReg32b2 = _mm256_castsi128_si256(

138 _mm_loadu_si128((__m128i *)(src_ptr+5)));	145 _mm_loadu_si128((const __m128i *)(src_ptr + 5)));

139 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,	146 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,

140 _mm_loadu_si128((__m128i *)	147 _mm_loadu_si128((const __m128i *)

141 (src_ptr+src_pixels_per_line+5)), 1);	148 (src_ptr+src_pixels_per_line+5)), 1);

142	149

143 // add and saturate the results together	150 // add and saturate the results together

144 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,	151 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,

145 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));	152 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

146	153

147 // filter the source buffer	154 // filter the source buffer

148 srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);	155 srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);

149 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);	156 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);

150	157

(...skipping 44 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
195 _mm256_extractf128_si256(srcRegFilt32b1_1, 1));	202 _mm256_extractf128_si256(srcRegFilt32b1_1, 1));

196 output_ptr+=dst_stride;	203 output_ptr+=dst_stride;

197 }	204 }

198	205

199 // if the number of strides is odd.	206 // if the number of strides is odd.

200 // process only 16 bytes	207 // process only 16 bytes

201 if (i > 0) {	208 if (i > 0) {

202 __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;	209 __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;

203 __m128i srcRegFilt2, srcRegFilt3;	210 __m128i srcRegFilt2, srcRegFilt3;

204	211

205 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));	212 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

206	213

207 // filter the source buffer	214 // filter the source buffer

208 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,	215 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,

209 _mm256_castsi256_si128(filt1Reg));	216 _mm256_castsi256_si128(filt1Reg));

210 srcRegFilt2 = _mm_shuffle_epi8(srcReg1,	217 srcRegFilt2 = _mm_shuffle_epi8(srcReg1,

211 _mm256_castsi256_si128(filt4Reg));	218 _mm256_castsi256_si128(filt4Reg));

212	219

213 // multiply 2 adjacent elements with the filter and add the result	220 // multiply 2 adjacent elements with the filter and add the result

214 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,	221 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,

215 _mm256_castsi256_si128(firstFilters));	222 _mm256_castsi256_si128(firstFilters));

(...skipping 14 matching lines...) Expand all Loading...
230 _mm256_castsi256_si128(secondFilters));	237 _mm256_castsi256_si128(secondFilters));

231 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,	238 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,

232 _mm256_castsi256_si128(thirdFilters));	239 _mm256_castsi256_si128(thirdFilters));

233	240

234 // add and saturate the results together	241 // add and saturate the results together

235 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,	242 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,

236 _mm_min_epi16(srcRegFilt3, srcRegFilt2));	243 _mm_min_epi16(srcRegFilt3, srcRegFilt2));

237	244

238 // reading the next 16 bytes	245 // reading the next 16 bytes

239 // (part of it was being read by earlier read)	246 // (part of it was being read by earlier read)

240 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));	247 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));

241	248

242 // add and saturate the results together	249 // add and saturate the results together

243 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,	250 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,

244 _mm_max_epi16(srcRegFilt3, srcRegFilt2));	251 _mm_max_epi16(srcRegFilt3, srcRegFilt2));

245	252

246 // filter the source buffer	253 // filter the source buffer

247 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,	254 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,

248 _mm256_castsi256_si128(filt1Reg));	255 _mm256_castsi256_si128(filt1Reg));

249 srcRegFilt2 = _mm_shuffle_epi8(srcReg2,	256 srcRegFilt2 = _mm_shuffle_epi8(srcReg2,

250 _mm256_castsi256_si128(filt4Reg));	257 _mm256_castsi256_si128(filt4Reg));

(...skipping 39 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
290 // shrink to 8 bit each 16 bits, the first lane contain the first	297 // shrink to 8 bit each 16 bits, the first lane contain the first

291 // convolve result and the second lane contain the second convolve	298 // convolve result and the second lane contain the second convolve

292 // result	299 // result

293 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);	300 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

294	301

295 // save 16 bytes	302 // save 16 bytes

296 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);	303 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);

297 }	304 }

298 }	305 }

299	306

300 void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,	307 static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr,

301 unsigned int src_pitch,	308 ptrdiff_t src_pitch,

302 unsigned char *output_ptr,	309 uint8_t *output_ptr,

303 unsigned int out_pitch,	310 ptrdiff_t out_pitch,

304 unsigned int output_height,	311 uint32_t output_height,

305 int16_t *filter) {	312 const int16_t *filter) {

306 __m128i filtersReg;	313 __m128i filtersReg;

307 __m256i addFilterReg64;	314 __m256i addFilterReg64;

308 __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;	315 __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;

309 __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;	316 __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;

310 __m256i srcReg32b11, srcReg32b12, filtersReg32;	317 __m256i srcReg32b11, srcReg32b12, filtersReg32;

311 __m256i firstFilters, secondFilters, thirdFilters, forthFilters;	318 __m256i firstFilters, secondFilters, thirdFilters, forthFilters;

312 unsigned int i;	319 unsigned int i;

313 unsigned int src_stride, dst_stride;	320 ptrdiff_t src_stride, dst_stride;

314	321

315 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64	322 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64

316 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);	323 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);

317 filtersReg = _mm_loadu_si128((__m128i *)filter);	324 filtersReg = _mm_loadu_si128((const __m128i *)filter);

318 // converting the 16 bit (short) to 8 bit (byte) and have the	325 // converting the 16 bit (short) to 8 bit (byte) and have the

319 // same data in both lanes of 128 bit register.	326 // same data in both lanes of 128 bit register.

320 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);	327 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);

321 // have the same data in both lanes of a 256 bit register	328 // have the same data in both lanes of a 256 bit register

322 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);	329 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

323	330

324 // duplicate only the first 16 bits (first and second byte)	331 // duplicate only the first 16 bits (first and second byte)

325 // across 256 bit register	332 // across 256 bit register

326 firstFilters = _mm256_shuffle_epi8(filtersReg32,	333 firstFilters = _mm256_shuffle_epi8(filtersReg32,

327 _mm256_set1_epi16(0x100u));	334 _mm256_set1_epi16(0x100u));

328 // duplicate only the second 16 bits (third and forth byte)	335 // duplicate only the second 16 bits (third and forth byte)

329 // across 256 bit register	336 // across 256 bit register

330 secondFilters = _mm256_shuffle_epi8(filtersReg32,	337 secondFilters = _mm256_shuffle_epi8(filtersReg32,

331 _mm256_set1_epi16(0x302u));	338 _mm256_set1_epi16(0x302u));

332 // duplicate only the third 16 bits (fifth and sixth byte)	339 // duplicate only the third 16 bits (fifth and sixth byte)

333 // across 256 bit register	340 // across 256 bit register

334 thirdFilters = _mm256_shuffle_epi8(filtersReg32,	341 thirdFilters = _mm256_shuffle_epi8(filtersReg32,

335 _mm256_set1_epi16(0x504u));	342 _mm256_set1_epi16(0x504u));

336 // duplicate only the forth 16 bits (seventh and eighth byte)	343 // duplicate only the forth 16 bits (seventh and eighth byte)

337 // across 256 bit register	344 // across 256 bit register

338 forthFilters = _mm256_shuffle_epi8(filtersReg32,	345 forthFilters = _mm256_shuffle_epi8(filtersReg32,

339 _mm256_set1_epi16(0x706u));	346 _mm256_set1_epi16(0x706u));

340	347

341 // multiple the size of the source and destination stride by two	348 // multiple the size of the source and destination stride by two

342 src_stride = src_pitch << 1;	349 src_stride = src_pitch << 1;

343 dst_stride = out_pitch << 1;	350 dst_stride = out_pitch << 1;

344	351

345 // load 16 bytes 7 times in stride of src_pitch	352 // load 16 bytes 7 times in stride of src_pitch

346 srcReg32b1 = _mm256_castsi128_si256(	353 srcReg32b1 = _mm256_castsi128_si256(

347 _mm_loadu_si128((__m128i *)(src_ptr)));	354 _mm_loadu_si128((const __m128i *)(src_ptr)));

348 srcReg32b2 = _mm256_castsi128_si256(	355 srcReg32b2 = _mm256_castsi128_si256(

349 _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)));	356 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));

350 srcReg32b3 = _mm256_castsi128_si256(	357 srcReg32b3 = _mm256_castsi128_si256(

351 _mm_loadu_si128((__m128i )(src_ptr+src_pitch2)));	358 _mm_loadu_si128((const __m128i )(src_ptr + src_pitch 2)));

352 srcReg32b4 = _mm256_castsi128_si256(	359 srcReg32b4 = _mm256_castsi128_si256(

353 _mm_loadu_si128((__m128i )(src_ptr+src_pitch3)));	360 _mm_loadu_si128((const __m128i )(src_ptr + src_pitch 3)));

354 srcReg32b5 = _mm256_castsi128_si256(	361 srcReg32b5 = _mm256_castsi128_si256(

355 _mm_loadu_si128((__m128i )(src_ptr+src_pitch4)));	362 _mm_loadu_si128((const __m128i )(src_ptr + src_pitch 4)));

356 srcReg32b6 = _mm256_castsi128_si256(	363 srcReg32b6 = _mm256_castsi128_si256(

357 _mm_loadu_si128((__m128i )(src_ptr+src_pitch5)));	364 _mm_loadu_si128((const __m128i )(src_ptr + src_pitch 5)));

358 srcReg32b7 = _mm256_castsi128_si256(	365 srcReg32b7 = _mm256_castsi128_si256(

359 _mm_loadu_si128((__m128i )(src_ptr+src_pitch6)));	366 _mm_loadu_si128((const __m128i )(src_ptr + src_pitch 6)));

360	367

361 // have each consecutive loads on the same 256 register	368 // have each consecutive loads on the same 256 register

362 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,	369 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,

363 _mm256_castsi256_si128(srcReg32b2), 1);	370 _mm256_castsi256_si128(srcReg32b2), 1);

364 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,	371 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,

365 _mm256_castsi256_si128(srcReg32b3), 1);	372 _mm256_castsi256_si128(srcReg32b3), 1);

366 srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,	373 srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,

367 _mm256_castsi256_si128(srcReg32b4), 1);	374 _mm256_castsi256_si128(srcReg32b4), 1);

368 srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,	375 srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,

369 _mm256_castsi256_si128(srcReg32b5), 1);	376 _mm256_castsi256_si128(srcReg32b5), 1);

(...skipping 16 matching lines...) Expand all Loading...
386 srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);	393 srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);

387	394

388 // save	395 // save

389 srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);	396 srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);

390	397

391	398

392 for (i = output_height; i > 1; i-=2) {	399 for (i = output_height; i > 1; i-=2) {

393 // load the last 2 loads of 16 bytes and have every two	400 // load the last 2 loads of 16 bytes and have every two

394 // consecutive loads in the same 256 bit register	401 // consecutive loads in the same 256 bit register

395 srcReg32b8 = _mm256_castsi128_si256(	402 srcReg32b8 = _mm256_castsi128_si256(

396 _mm_loadu_si128((__m128i )(src_ptr+src_pitch7)));	403 _mm_loadu_si128((const __m128i )(src_ptr + src_pitch 7)));

397 srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,	404 srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,

398 _mm256_castsi256_si128(srcReg32b8), 1);	405 _mm256_castsi256_si128(srcReg32b8), 1);

399 srcReg32b9 = _mm256_castsi128_si256(	406 srcReg32b9 = _mm256_castsi128_si256(

400 _mm_loadu_si128((__m128i )(src_ptr+src_pitch8)));	407 _mm_loadu_si128((const __m128i )(src_ptr + src_pitch 8)));

401 srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,	408 srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,

402 _mm256_castsi256_si128(srcReg32b9), 1);	409 _mm256_castsi256_si128(srcReg32b9), 1);

403	410

404 // merge every two consecutive registers	411 // merge every two consecutive registers

405 // save	412 // save

406 srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);	413 srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);

407 srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);	414 srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

408	415

409 // multiply 2 adjacent elements with the filter and add the result	416 // multiply 2 adjacent elements with the filter and add the result

410 srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);	417 srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);

(...skipping 58 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
469 srcReg32b11 = srcReg32b2;	476 srcReg32b11 = srcReg32b2;

470 srcReg32b3 = srcReg32b5;	477 srcReg32b3 = srcReg32b5;

471 srcReg32b2 = srcReg32b4;	478 srcReg32b2 = srcReg32b4;

472 srcReg32b5 = srcReg32b7;	479 srcReg32b5 = srcReg32b7;

473 srcReg32b7 = srcReg32b9;	480 srcReg32b7 = srcReg32b9;

474 }	481 }

475 if (i > 0) {	482 if (i > 0) {

476 __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;	483 __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;

477 __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;	484 __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;

478 // load the last 16 bytes	485 // load the last 16 bytes

479 srcRegFilt8 = _mm_loadu_si128((__m128i )(src_ptr+src_pitch7));	486 srcRegFilt8 = _mm_loadu_si128((const __m128i )(src_ptr + src_pitch 7));

480	487

481 // merge the last 2 results together	488 // merge the last 2 results together

482 srcRegFilt4 = _mm_unpacklo_epi8(	489 srcRegFilt4 = _mm_unpacklo_epi8(

483 _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);	490 _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

484 srcRegFilt7 = _mm_unpackhi_epi8(	491 srcRegFilt7 = _mm_unpackhi_epi8(

485 _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);	492 _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

486	493

487 // multiply 2 adjacent elements with the filter and add the result	494 // multiply 2 adjacent elements with the filter and add the result

488 srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),	495 srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),

489 _mm256_castsi256_si128(firstFilters));	496 _mm256_castsi256_si128(firstFilters));

(...skipping 45 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
535	542

536 // shrink to 8 bit each 16 bits, the first lane contain the first	543 // shrink to 8 bit each 16 bits, the first lane contain the first

537 // convolve result and the second lane contain the second convolve	544 // convolve result and the second lane contain the second convolve

538 // result	545 // result

539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);	546 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

540	547

541 // save 16 bytes	548 // save 16 bytes

542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);	549 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);

543 }	550 }

544 }	551 }

	552

	553 #if HAVE_AVX2 && HAVE_SSSE3

	554 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;

	555 #if ARCH_X86_64

	556 filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;

	557 filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;

	558 filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;

	559 #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3

	560 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3

	561 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3

	562 #else // ARCH_X86

	563 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;

	564 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;

	565 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;

	566 #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3

	567 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3

	568 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3

	569 #endif // ARCH_X86_64

	570 filter8_1dfunction vp9_filter_block1d16_v2_ssse3;

	571 filter8_1dfunction vp9_filter_block1d16_h2_ssse3;

	572 filter8_1dfunction vp9_filter_block1d8_v2_ssse3;

	573 filter8_1dfunction vp9_filter_block1d8_h2_ssse3;

	574 filter8_1dfunction vp9_filter_block1d4_v2_ssse3;

	575 filter8_1dfunction vp9_filter_block1d4_h2_ssse3;

	576 #define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3

	577 #define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3

	578 #define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3

	579 #define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3

	580 #define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3

	581 #define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3

	582 #define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3

	583 // void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,

	584 // uint8_t *dst, ptrdiff_t dst_stride,

	585 // const int16_t *filter_x, int x_step_q4,

	586 // const int16_t *filter_y, int y_step_q4,

	587 // int w, int h);

	588 // void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,

	589 // uint8_t *dst, ptrdiff_t dst_stride,

	590 // const int16_t *filter_x, int x_step_q4,

	591 // const int16_t *filter_y, int y_step_q4,

	592 // int w, int h);

	593 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);

	594 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);

	595

	596 // void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,

	597 // uint8_t *dst, ptrdiff_t dst_stride,

	598 // const int16_t *filter_x, int x_step_q4,

	599 // const int16_t *filter_y, int y_step_q4,

	600 // int w, int h);

	601 FUN_CONV_2D(, avx2);

	602 #endif // HAVE_AX2 && HAVE_SSSE3

OLD	NEW