OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
| 11 // Due to a header conflict between math.h and intrinsics includes with ceil() |
| 12 // in certain configurations under vs9 this include needs to precede |
| 13 // tmmintrin.h. |
| 14 #include "./vp9_rtcd.h" |
| 15 |
11 #include <tmmintrin.h> | 16 #include <tmmintrin.h> |
| 17 |
| 18 #include "vp9/common/x86/convolve.h" |
12 #include "vpx_ports/mem.h" | 19 #include "vpx_ports/mem.h" |
13 #include "vpx_ports/emmintrin_compat.h" | 20 #include "vpx_ports/emmintrin_compat.h" |
14 | 21 |
15 // filters only for the 4_h8 convolution | 22 // filters only for the 4_h8 convolution |
16 DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { | 23 DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { |
17 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 | 24 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 |
18 }; | 25 }; |
19 | 26 |
20 DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { | 27 DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { |
21 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 | 28 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 |
22 }; | 29 }; |
23 | 30 |
24 // filters for 8_h8 and 16_h8 | 31 // filters for 8_h8 and 16_h8 |
25 DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { | 32 DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { |
26 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 | 33 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
27 }; | 34 }; |
28 | 35 |
29 DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { | 36 DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { |
30 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 | 37 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
31 }; | 38 }; |
32 | 39 |
33 DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { | 40 DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { |
34 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 | 41 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 |
35 }; | 42 }; |
36 | 43 |
37 DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { | 44 DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { |
38 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 | 45 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |
39 }; | 46 }; |
40 | 47 |
41 void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, | 48 // These are reused by the avx2 intrinsics. |
42 unsigned int src_pixels_per_line, | 49 filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; |
43 unsigned char *output_ptr, | 50 filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; |
44 unsigned int output_pitch, | 51 filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; |
45 unsigned int output_height, | 52 |
46 int16_t *filter) { | 53 void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, |
| 54 ptrdiff_t src_pixels_per_line, |
| 55 uint8_t *output_ptr, |
| 56 ptrdiff_t output_pitch, |
| 57 uint32_t output_height, |
| 58 const int16_t *filter) { |
47 __m128i firstFilters, secondFilters, shuffle1, shuffle2; | 59 __m128i firstFilters, secondFilters, shuffle1, shuffle2; |
48 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; | 60 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; |
49 __m128i addFilterReg64, filtersReg, srcReg, minReg; | 61 __m128i addFilterReg64, filtersReg, srcReg, minReg; |
50 unsigned int i; | 62 unsigned int i; |
51 | 63 |
52 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 64 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
53 addFilterReg64 =_mm_set1_epi32((int)0x0400040u); | 65 addFilterReg64 =_mm_set1_epi32((int)0x0400040u); |
54 filtersReg = _mm_loadu_si128((__m128i *)filter); | 66 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
55 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 67 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
56 // in both lanes of 128 bit register. | 68 // in both lanes of 128 bit register. |
57 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 69 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
58 | 70 |
59 // duplicate only the first 16 bits in the filter into the first lane | 71 // duplicate only the first 16 bits in the filter into the first lane |
60 firstFilters = _mm_shufflelo_epi16(filtersReg, 0); | 72 firstFilters = _mm_shufflelo_epi16(filtersReg, 0); |
61 // duplicate only the third 16 bit in the filter into the first lane | 73 // duplicate only the third 16 bit in the filter into the first lane |
62 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); | 74 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); |
63 // duplicate only the seconds 16 bits in the filter into the second lane | 75 // duplicate only the seconds 16 bits in the filter into the second lane |
64 // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 | 76 // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 |
65 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); | 77 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); |
66 // duplicate only the forth 16 bits in the filter into the second lane | 78 // duplicate only the forth 16 bits in the filter into the second lane |
67 // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 | 79 // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 |
68 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); | 80 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); |
69 | 81 |
70 // loading the local filters | 82 // loading the local filters |
71 shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); | 83 shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); |
72 shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); | 84 shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); |
73 | 85 |
74 for (i = 0; i < output_height; i++) { | 86 for (i = 0; i < output_height; i++) { |
75 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); | 87 srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); |
76 | 88 |
77 // filter the source buffer | 89 // filter the source buffer |
78 srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); | 90 srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); |
79 srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2); | 91 srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2); |
80 | 92 |
81 // multiply 2 adjacent elements with the filter and add the result | 93 // multiply 2 adjacent elements with the filter and add the result |
82 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); | 94 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); |
83 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); | 95 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); |
84 | 96 |
85 // extract the higher half of the lane | 97 // extract the higher half of the lane |
(...skipping 16 matching lines...) Expand all Loading... |
102 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); | 114 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); |
103 src_ptr+=src_pixels_per_line; | 115 src_ptr+=src_pixels_per_line; |
104 | 116 |
105 // save only 4 bytes | 117 // save only 4 bytes |
106 *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); | 118 *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); |
107 | 119 |
108 output_ptr+=output_pitch; | 120 output_ptr+=output_pitch; |
109 } | 121 } |
110 } | 122 } |
111 | 123 |
112 void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, | 124 void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, |
113 unsigned int src_pixels_per_line, | 125 ptrdiff_t src_pixels_per_line, |
114 unsigned char *output_ptr, | 126 uint8_t *output_ptr, |
115 unsigned int output_pitch, | 127 ptrdiff_t output_pitch, |
116 unsigned int output_height, | 128 uint32_t output_height, |
117 int16_t *filter) { | 129 const int16_t *filter) { |
118 __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; | 130 __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; |
119 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; | 131 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; |
120 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; | 132 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; |
121 __m128i addFilterReg64, filtersReg, minReg; | 133 __m128i addFilterReg64, filtersReg, minReg; |
122 unsigned int i; | 134 unsigned int i; |
123 | 135 |
124 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 136 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
125 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); | 137 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
126 filtersReg = _mm_loadu_si128((__m128i *)filter); | 138 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
127 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 139 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
128 // in both lanes of 128 bit register. | 140 // in both lanes of 128 bit register. |
129 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 141 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
130 | 142 |
131 // duplicate only the first 16 bits (first and second byte) | 143 // duplicate only the first 16 bits (first and second byte) |
132 // across 128 bit register | 144 // across 128 bit register |
133 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | 145 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); |
134 // duplicate only the second 16 bits (third and forth byte) | 146 // duplicate only the second 16 bits (third and forth byte) |
135 // across 128 bit register | 147 // across 128 bit register |
136 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | 148 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); |
137 // duplicate only the third 16 bits (fifth and sixth byte) | 149 // duplicate only the third 16 bits (fifth and sixth byte) |
138 // across 128 bit register | 150 // across 128 bit register |
139 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | 151 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); |
140 // duplicate only the forth 16 bits (seventh and eighth byte) | 152 // duplicate only the forth 16 bits (seventh and eighth byte) |
141 // across 128 bit register | 153 // across 128 bit register |
142 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | 154 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); |
143 | 155 |
144 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); | 156 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); |
145 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); | 157 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); |
146 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); | 158 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); |
147 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); | 159 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); |
148 | 160 |
149 for (i = 0; i < output_height; i++) { | 161 for (i = 0; i < output_height; i++) { |
150 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); | 162 srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); |
151 | 163 |
152 // filter the source buffer | 164 // filter the source buffer |
153 srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); | 165 srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); |
154 srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); | 166 srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); |
155 | 167 |
156 // multiply 2 adjacent elements with the filter and add the result | 168 // multiply 2 adjacent elements with the filter and add the result |
157 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); | 169 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); |
158 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); | 170 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); |
159 | 171 |
160 // filter the source buffer | 172 // filter the source buffer |
(...skipping 21 matching lines...) Expand all Loading... |
182 | 194 |
183 src_ptr+=src_pixels_per_line; | 195 src_ptr+=src_pixels_per_line; |
184 | 196 |
185 // save only 8 bytes | 197 // save only 8 bytes |
186 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); | 198 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); |
187 | 199 |
188 output_ptr+=output_pitch; | 200 output_ptr+=output_pitch; |
189 } | 201 } |
190 } | 202 } |
191 | 203 |
192 void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, | 204 static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, |
193 unsigned int src_pixels_per_line, | 205 ptrdiff_t src_pixels_per_line, |
194 unsigned char *output_ptr, | 206 uint8_t *output_ptr, |
195 unsigned int output_pitch, | 207 ptrdiff_t output_pitch, |
196 unsigned int output_height, | 208 uint32_t output_height, |
197 int16_t *filter) { | 209 const int16_t *filter) { |
198 __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; | 210 __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; |
199 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; | 211 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; |
200 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | 212 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
201 __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; | 213 __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; |
202 unsigned int i; | 214 unsigned int i; |
203 | 215 |
204 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 216 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
205 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); | 217 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
206 filtersReg = _mm_loadu_si128((__m128i *)filter); | 218 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
207 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 219 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
208 // in both lanes of 128 bit register. | 220 // in both lanes of 128 bit register. |
209 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 221 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
210 | 222 |
211 // duplicate only the first 16 bits (first and second byte) | 223 // duplicate only the first 16 bits (first and second byte) |
212 // across 128 bit register | 224 // across 128 bit register |
213 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | 225 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); |
214 // duplicate only the second 16 bits (third and forth byte) | 226 // duplicate only the second 16 bits (third and forth byte) |
215 // across 128 bit register | 227 // across 128 bit register |
216 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | 228 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); |
217 // duplicate only the third 16 bits (fifth and sixth byte) | 229 // duplicate only the third 16 bits (fifth and sixth byte) |
218 // across 128 bit register | 230 // across 128 bit register |
219 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | 231 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); |
220 // duplicate only the forth 16 bits (seventh and eighth byte) | 232 // duplicate only the forth 16 bits (seventh and eighth byte) |
221 // across 128 bit register | 233 // across 128 bit register |
222 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | 234 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); |
223 | 235 |
224 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); | 236 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); |
225 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); | 237 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); |
226 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); | 238 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); |
227 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); | 239 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); |
228 | 240 |
229 for (i = 0; i < output_height; i++) { | 241 for (i = 0; i < output_height; i++) { |
230 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); | 242 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); |
231 | 243 |
232 // filter the source buffer | 244 // filter the source buffer |
233 srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); | 245 srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); |
234 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg); | 246 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg); |
235 | 247 |
236 // multiply 2 adjacent elements with the filter and add the result | 248 // multiply 2 adjacent elements with the filter and add the result |
237 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); | 249 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); |
238 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); | 250 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); |
239 | 251 |
240 // add and saturate the results together | 252 // add and saturate the results together |
241 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); | 253 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); |
242 | 254 |
243 // filter the source buffer | 255 // filter the source buffer |
244 srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg); | 256 srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg); |
245 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); | 257 srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); |
246 | 258 |
247 // multiply 2 adjacent elements with the filter and add the result | 259 // multiply 2 adjacent elements with the filter and add the result |
248 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); | 260 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); |
249 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); | 261 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); |
250 | 262 |
251 // add and saturate the results together | 263 // add and saturate the results together |
252 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, | 264 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
253 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); | 265 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); |
254 | 266 |
255 // reading the next 16 bytes. | 267 // reading the next 16 bytes. |
256 // (part of it was being read by earlier read) | 268 // (part of it was being read by earlier read) |
257 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); | 269 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); |
258 | 270 |
259 // add and saturate the results together | 271 // add and saturate the results together |
260 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, | 272 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
261 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); | 273 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); |
262 | 274 |
263 // filter the source buffer | 275 // filter the source buffer |
264 srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); | 276 srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); |
265 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg); | 277 srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg); |
266 | 278 |
267 // multiply 2 adjacent elements with the filter and add the result | 279 // multiply 2 adjacent elements with the filter and add the result |
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
299 | 311 |
300 src_ptr+=src_pixels_per_line; | 312 src_ptr+=src_pixels_per_line; |
301 | 313 |
302 // save 16 bytes | 314 // save 16 bytes |
303 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); | 315 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); |
304 | 316 |
305 output_ptr+=output_pitch; | 317 output_ptr+=output_pitch; |
306 } | 318 } |
307 } | 319 } |
308 | 320 |
309 void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, | 321 void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, |
310 unsigned int src_pitch, | 322 ptrdiff_t src_pitch, |
311 unsigned char *output_ptr, | 323 uint8_t *output_ptr, |
312 unsigned int out_pitch, | 324 ptrdiff_t out_pitch, |
313 unsigned int output_height, | 325 uint32_t output_height, |
314 int16_t *filter) { | 326 const int16_t *filter) { |
315 __m128i addFilterReg64, filtersReg, minReg; | 327 __m128i addFilterReg64, filtersReg, minReg; |
316 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | 328 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
317 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; | 329 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; |
318 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; | 330 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; |
319 __m128i srcReg8; | 331 __m128i srcReg8; |
320 unsigned int i; | 332 unsigned int i; |
321 | 333 |
322 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 334 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
323 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); | 335 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
324 filtersReg = _mm_loadu_si128((__m128i *)filter); | 336 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
325 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 337 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
326 // in both lanes of 128 bit register. | 338 // in both lanes of 128 bit register. |
327 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 339 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
328 | 340 |
329 // duplicate only the first 16 bits in the filter | 341 // duplicate only the first 16 bits in the filter |
330 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | 342 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); |
331 // duplicate only the second 16 bits in the filter | 343 // duplicate only the second 16 bits in the filter |
332 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | 344 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); |
333 // duplicate only the third 16 bits in the filter | 345 // duplicate only the third 16 bits in the filter |
334 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | 346 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); |
335 // duplicate only the forth 16 bits in the filter | 347 // duplicate only the forth 16 bits in the filter |
336 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | 348 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); |
337 | 349 |
338 // load the first 7 rows of 8 bytes | 350 // load the first 7 rows of 8 bytes |
339 srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); | 351 srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); |
340 srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]); | 352 srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); |
341 srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]); | 353 srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); |
342 srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]); | 354 srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); |
343 srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]); | 355 srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); |
344 srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]); | 356 srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); |
345 srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]); | 357 srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); |
346 | 358 |
347 for (i = 0; i < output_height; i++) { | 359 for (i = 0; i < output_height; i++) { |
348 // load the last 8 bytes | 360 // load the last 8 bytes |
349 srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]); | 361 srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); |
350 | 362 |
351 // merge the result together | 363 // merge the result together |
352 srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); | 364 srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); |
353 srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); | 365 srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); |
354 | 366 |
355 // merge the result together | 367 // merge the result together |
356 srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); | 368 srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); |
357 srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); | 369 srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); |
358 | 370 |
359 // multiply 2 adjacent elements with the filter and add the result | 371 // multiply 2 adjacent elements with the filter and add the result |
(...skipping 27 matching lines...) Expand all Loading... |
387 srcReg6 = srcReg7; | 399 srcReg6 = srcReg7; |
388 srcReg7 = srcReg8; | 400 srcReg7 = srcReg8; |
389 | 401 |
390 // save only 8 bytes convolve result | 402 // save only 8 bytes convolve result |
391 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); | 403 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); |
392 | 404 |
393 output_ptr+=out_pitch; | 405 output_ptr+=out_pitch; |
394 } | 406 } |
395 } | 407 } |
396 | 408 |
397 void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, | 409 static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, |
398 unsigned int src_pitch, | 410 ptrdiff_t src_pitch, |
399 unsigned char *output_ptr, | 411 uint8_t *output_ptr, |
400 unsigned int out_pitch, | 412 ptrdiff_t out_pitch, |
401 unsigned int output_height, | 413 uint32_t output_height, |
402 int16_t *filter) { | 414 const int16_t *filter) { |
403 __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; | 415 __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; |
404 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | 416 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
405 __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; | 417 __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; |
406 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; | 418 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; |
407 __m128i srcReg8; | 419 __m128i srcReg8; |
408 unsigned int i; | 420 unsigned int i; |
409 | 421 |
410 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 | 422 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 |
411 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); | 423 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); |
412 filtersReg = _mm_loadu_si128((__m128i *)filter); | 424 filtersReg = _mm_loadu_si128((const __m128i *)filter); |
413 // converting the 16 bit (short) to 8 bit (byte) and have the same data | 425 // converting the 16 bit (short) to 8 bit (byte) and have the same data |
414 // in both lanes of 128 bit register. | 426 // in both lanes of 128 bit register. |
415 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | 427 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); |
416 | 428 |
417 // duplicate only the first 16 bits in the filter | 429 // duplicate only the first 16 bits in the filter |
418 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | 430 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); |
419 // duplicate only the second 16 bits in the filter | 431 // duplicate only the second 16 bits in the filter |
420 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | 432 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); |
421 // duplicate only the third 16 bits in the filter | 433 // duplicate only the third 16 bits in the filter |
422 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | 434 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); |
423 // duplicate only the forth 16 bits in the filter | 435 // duplicate only the forth 16 bits in the filter |
424 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | 436 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); |
425 | 437 |
426 // load the first 7 rows of 16 bytes | 438 // load the first 7 rows of 16 bytes |
427 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr)); | 439 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); |
428 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch)); | 440 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); |
429 srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2)); | 441 srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); |
430 srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3)); | 442 srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); |
431 srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4)); | 443 srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); |
432 srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5)); | 444 srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); |
433 srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6)); | 445 srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); |
434 | 446 |
435 for (i = 0; i < output_height; i++) { | 447 for (i = 0; i < output_height; i++) { |
436 // load the last 16 bytes | 448 // load the last 16 bytes |
437 srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7)); | 449 srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); |
438 | 450 |
439 // merge the result together | 451 // merge the result together |
440 srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2); | 452 srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2); |
441 srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8); | 453 srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8); |
442 srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2); | 454 srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2); |
443 srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8); | 455 srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8); |
444 | 456 |
445 // multiply 2 adjacent elements with the filter and add the result | 457 // multiply 2 adjacent elements with the filter and add the result |
446 srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); | 458 srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); |
447 srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); | 459 srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); |
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
501 srcReg5 = srcReg6; | 513 srcReg5 = srcReg6; |
502 srcReg6 = srcReg7; | 514 srcReg6 = srcReg7; |
503 srcReg7 = srcReg8; | 515 srcReg7 = srcReg8; |
504 | 516 |
505 // save 16 bytes convolve result | 517 // save 16 bytes convolve result |
506 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); | 518 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); |
507 | 519 |
508 output_ptr+=out_pitch; | 520 output_ptr+=out_pitch; |
509 } | 521 } |
510 } | 522 } |
| 523 |
// Forward declarations of the 1-D filter kernels used by the convolve
// wrappers below.  NOTE(review): the filter8_1dfunction type presumably
// comes from vp9/common/x86/convolve.h (included above) — confirm.
#if ARCH_X86_64
// On x86-64 the intrinsic implementations defined in this file are used:
// alias the generic kernel names onto the *_intrin_* symbols.
// NOTE(review): vp9_filter_block1d4_v8 has no *_intrin_* variant here, so it
// keeps the non-intrinsic (presumably assembly) implementation and gets no
// #define alias.
filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
#else  // ARCH_X86
// On 32-bit x86 all 8-tap kernels use the non-intrinsic implementations.
filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
#endif  // ARCH_X86_64
// Averaging ("avg") variants of the 8-tap kernels.
filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;

// 2-tap (v2/h2) kernels and their averaging variants.
filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
| 563 |
// The FUN_CONV_1D invocations below expand into the definitions of the four
// 1-D convolve entry points whose prototypes are spelled out in the comments;
// the macro builds the function bodies from the vp9_filter_block1d* kernels
// declared above via token pasting with the trailing 'ssse3' argument.
// NOTE(review): FUN_CONV_1D is presumably defined in
// vp9/common/x86/convolve.h (included above) — confirm.
// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                uint8_t *dst, ptrdiff_t dst_stride,
//                                const int16_t *filter_x, int x_step_q4,
//                                const int16_t *filter_y, int y_step_q4,
//                                int w, int h);
// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                    uint8_t *dst, ptrdiff_t dst_stride,
//                                    const int16_t *filter_x, int x_step_q4,
//                                    const int16_t *filter_y, int y_step_q4,
//                                    int w, int h);
// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                   uint8_t *dst, ptrdiff_t dst_stride,
//                                   const int16_t *filter_x, int x_step_q4,
//                                   const int16_t *filter_y, int y_step_q4,
//                                   int w, int h);
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
// NOTE(review): the vertical passes start reading 3 rows above 'src',
// apparently to supply the upper taps of the 8-tap filter — confirm against
// the FUN_CONV_1D macro definition.
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
            ssse3);
| 589 |
| 590 // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 591 // uint8_t *dst, ptrdiff_t dst_stride, |
| 592 // const int16_t *filter_x, int x_step_q4, |
| 593 // const int16_t *filter_y, int y_step_q4, |
| 594 // int w, int h); |
| 595 // void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 596 // uint8_t *dst, ptrdiff_t dst_stride, |
| 597 // const int16_t *filter_x, int x_step_q4, |
| 598 // const int16_t *filter_y, int y_step_q4, |
| 599 // int w, int h); |
| 600 FUN_CONV_2D(, ssse3); |
| 601 FUN_CONV_2D(avg_ , ssse3); |
OLD | NEW |