source/libvpx/vp9/common/x86/vp9_asm_stubs.c - Issue 168343002: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_asm_stubs.c

Issue 168343002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: libvpx: Pull from upstream Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include <assert.h>	11 #include <assert.h>

12	12

13 #include "./vpx_config.h"	13 #include "./vpx_config.h"

14 #include "./vp9_rtcd.h"	14 #include "./vp9_rtcd.h"

15 #include "vpx_ports/mem.h"	15 #include "vpx_ports/mem.h"

16	16

17 typedef void filter8_1dfunction (	17 typedef void filter8_1dfunction (

18 const unsigned char *src_ptr,	18 const unsigned char *src_ptr,

19 const unsigned int src_pitch,	19 const ptrdiff_t src_pitch,

20 unsigned char *output_ptr,	20 unsigned char *output_ptr,

21 unsigned int out_pitch,	21 ptrdiff_t out_pitch,

22 unsigned int output_height,	22 unsigned int output_height,

23 const short *filter	23 const short *filter

24 );	24 );

25	25

	26 #define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \

	27 void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \

	28 uint8_t *dst, ptrdiff_t dst_stride, \

	29 const int16_t *filter_x, int x_step_q4, \

	30 const int16_t *filter_y, int y_step_q4, \

	31 int w, int h) { \

	32 if (step_q4 == 16 && filter[3] != 128) { \

	33 if (filter[0] || filter[1] || filter[2]) { \

	34 while (w >= 16) { \

	35 vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \

	36 src_stride, \

	37 dst, \

	38 dst_stride, \

	39 h, \

	40 filter); \

	41 src += 16; \

	42 dst += 16; \

	43 w -= 16; \

	44 } \

	45 while (w >= 8) { \

	46 vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \

	47 src_stride, \

	48 dst, \

	49 dst_stride, \

	50 h, \

	51 filter); \

	52 src += 8; \

	53 dst += 8; \

	54 w -= 8; \

	55 } \

	56 while (w >= 4) { \

	57 vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \

	58 src_stride, \

	59 dst, \

	60 dst_stride, \

	61 h, \

	62 filter); \

	63 src += 4; \

	64 dst += 4; \

	65 w -= 4; \

	66 } \

	67 } else { \

	68 while (w >= 16) { \

	69 vp9_filter_block1d16_##dir##2_##avg##opt(src, \

	70 src_stride, \

	71 dst, \

	72 dst_stride, \

	73 h, \

	74 filter); \

	75 src += 16; \

	76 dst += 16; \

	77 w -= 16; \

	78 } \

	79 while (w >= 8) { \

	80 vp9_filter_block1d8_##dir##2_##avg##opt(src, \

	81 src_stride, \

	82 dst, \

	83 dst_stride, \

	84 h, \

	85 filter); \

	86 src += 8; \

	87 dst += 8; \

	88 w -= 8; \

	89 } \

	90 while (w >= 4) { \

	91 vp9_filter_block1d4_##dir##2_##avg##opt(src, \

	92 src_stride, \

	93 dst, \

	94 dst_stride, \

	95 h, \

	96 filter); \

	97 src += 4; \

	98 dst += 4; \

	99 w -= 4; \

	100 } \

	101 } \

	102 } \

	103 if (w) { \

	104 vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \

	105 filter_x, x_step_q4, filter_y, y_step_q4, \

	106 w, h); \

	107 } \

	108 }

	109

	110 #define FUN_CONV_2D(avg, opt) \

	111 void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \

	112 uint8_t *dst, ptrdiff_t dst_stride, \

	113 const int16_t *filter_x, int x_step_q4, \

	114 const int16_t *filter_y, int y_step_q4, \

	115 int w, int h) { \

	116 assert(w <= 64); \

	117 assert(h <= 64); \

	118 if (x_step_q4 == 16 && y_step_q4 == 16) { \

	119 if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \

	120 filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \

	121 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \

	122 vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \

	123 filter_x, x_step_q4, filter_y, y_step_q4, \

	124 w, h + 7); \

	125 vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \

	126 filter_x, x_step_q4, filter_y, \

	127 y_step_q4, w, h); \

	128 } else { \

	129 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 65); \

	130 vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \

	131 filter_x, x_step_q4, filter_y, y_step_q4, \

	132 w, h + 1); \

	133 vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \

	134 filter_x, x_step_q4, filter_y, \

	135 y_step_q4, w, h); \

	136 } \

	137 } else { \

	138 vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \

	139 filter_x, x_step_q4, filter_y, y_step_q4, w, h); \

	140 } \

	141 }

	142 #if HAVE_AVX2

	143 filter8_1dfunction vp9_filter_block1d16_v8_avx2;

	144 filter8_1dfunction vp9_filter_block1d16_h8_avx2;

	145 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;

	146 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;

	147 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;

	148 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;

	149 filter8_1dfunction vp9_filter_block1d16_v2_ssse3;

	150 filter8_1dfunction vp9_filter_block1d16_h2_ssse3;

	151 filter8_1dfunction vp9_filter_block1d8_v2_ssse3;

	152 filter8_1dfunction vp9_filter_block1d8_h2_ssse3;

	153 filter8_1dfunction vp9_filter_block1d4_v2_ssse3;

	154 filter8_1dfunction vp9_filter_block1d4_h2_ssse3;

	155 #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3

	156 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3

	157 #define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3

	158 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3

	159 #define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3

	160 #define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3

	161 #define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3

	162 #define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3

	163 #define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3

	164 #define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3

	165 // void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,

	166 // uint8_t *dst, ptrdiff_t dst_stride,

	167 // const int16_t *filter_x, int x_step_q4,

	168 // const int16_t *filter_y, int y_step_q4,

	169 // int w, int h);

	170 // void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,

	171 // uint8_t *dst, ptrdiff_t dst_stride,

	172 // const int16_t *filter_x, int x_step_q4,

	173 // const int16_t *filter_y, int y_step_q4,

	174 // int w, int h);

	175 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);

	176 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);

	177

	178 // void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,

	179 // uint8_t *dst, ptrdiff_t dst_stride,

	180 // const int16_t *filter_x, int x_step_q4,

	181 // const int16_t *filter_y, int y_step_q4,

	182 // int w, int h);

	183 FUN_CONV_2D(, avx2);

	184 #endif

26 #if HAVE_SSSE3	185 #if HAVE_SSSE3

27 filter8_1dfunction vp9_filter_block1d16_v8_ssse3;	186 filter8_1dfunction vp9_filter_block1d16_v8_ssse3;

28 filter8_1dfunction vp9_filter_block1d16_h8_ssse3;	187 filter8_1dfunction vp9_filter_block1d16_h8_ssse3;

29 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;	188 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;

30 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;	189 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;

31 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;	190 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;

32 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;	191 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;

33 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;	192 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;

34 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;	193 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;

35 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;	194 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;

36 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;	195 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;

37 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;	196 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;

38 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;	197 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;

39	198

40 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,	199 filter8_1dfunction vp9_filter_block1d16_v2_ssse3;

41 uint8_t *dst, ptrdiff_t dst_stride,	200 filter8_1dfunction vp9_filter_block1d16_h2_ssse3;

42 const int16_t *filter_x, int x_step_q4,	201 filter8_1dfunction vp9_filter_block1d8_v2_ssse3;

43 const int16_t *filter_y, int y_step_q4,	202 filter8_1dfunction vp9_filter_block1d8_h2_ssse3;

44 int w, int h) {	203 filter8_1dfunction vp9_filter_block1d4_v2_ssse3;

45 /* Ensure the filter can be compressed to int16_t. */	204 filter8_1dfunction vp9_filter_block1d4_h2_ssse3;

46 if (x_step_q4 == 16 && filter_x[3] != 128) {	205 filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;

47 while (w >= 16) {	206 filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;

48 vp9_filter_block1d16_h8_ssse3(src, src_stride,	207 filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;

49 dst, dst_stride,	208 filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;

50 h, filter_x);	209 filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;

51 src += 16;	210 filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;

52 dst += 16;

53 w -= 16;

54 }

55 while (w >= 8) {

56 vp9_filter_block1d8_h8_ssse3(src, src_stride,

57 dst, dst_stride,

58 h, filter_x);

59 src += 8;

60 dst += 8;

61 w -= 8;

62 }

63 while (w >= 4) {

64 vp9_filter_block1d4_h8_ssse3(src, src_stride,

65 dst, dst_stride,

66 h, filter_x);

67 src += 4;

68 dst += 4;

69 w -= 4;

70 }

71 }

72 if (w) {

73 vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,

74 filter_x, x_step_q4, filter_y, y_step_q4,

75 w, h);

76 }

77 }

78	211

79 void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,	212 // void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,

80 uint8_t *dst, ptrdiff_t dst_stride,	213 // uint8_t *dst, ptrdiff_t dst_stride,

81 const int16_t *filter_x, int x_step_q4,	214 // const int16_t *filter_x, int x_step_q4,

82 const int16_t *filter_y, int y_step_q4,	215 // const int16_t *filter_y, int y_step_q4,

83 int w, int h) {	216 // int w, int h);

84 if (y_step_q4 == 16 && filter_y[3] != 128) {	217 // void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,

85 while (w >= 16) {	218 // uint8_t *dst, ptrdiff_t dst_stride,

86 vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride,	219 // const int16_t *filter_x, int x_step_q4,

87 dst, dst_stride,	220 // const int16_t *filter_y, int y_step_q4,

88 h, filter_y);	221 // int w, int h);

89 src += 16;	222 // void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,

90 dst += 16;	223 // uint8_t *dst, ptrdiff_t dst_stride,

91 w -= 16;	224 // const int16_t *filter_x, int x_step_q4,

92 }	225 // const int16_t *filter_y, int y_step_q4,

93 while (w >= 8) {	226 // int w, int h);

94 vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride,	227 // void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,

95 dst, dst_stride,	228 // uint8_t *dst, ptrdiff_t dst_stride,

96 h, filter_y);	229 // const int16_t *filter_x, int x_step_q4,

97 src += 8;	230 // const int16_t *filter_y, int y_step_q4,

98 dst += 8;	231 // int w, int h);

99 w -= 8;	232 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);

100 }	233 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);

101 while (w >= 4) {	234 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);

102 vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride,	235 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,

103 dst, dst_stride,	236 ssse3);

104 h, filter_y);

105 src += 4;

106 dst += 4;

107 w -= 4;

108 }

109 }

110 if (w) {

111 vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,

112 filter_x, x_step_q4, filter_y, y_step_q4,

113 w, h);

114 }

115 }

116	237

117 void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,	238 // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,

118 uint8_t *dst, ptrdiff_t dst_stride,	239 // uint8_t *dst, ptrdiff_t dst_stride,

119 const int16_t *filter_x, int x_step_q4,	240 // const int16_t *filter_x, int x_step_q4,

120 const int16_t *filter_y, int y_step_q4,	241 // const int16_t *filter_y, int y_step_q4,

121 int w, int h) {	242 // int w, int h);

122 if (x_step_q4 == 16 && filter_x[3] != 128) {	243 // void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,

123 while (w >= 16) {	244 // uint8_t *dst, ptrdiff_t dst_stride,

124 vp9_filter_block1d16_h8_avg_ssse3(src, src_stride,	245 // const int16_t *filter_x, int x_step_q4,

125 dst, dst_stride,	246 // const int16_t *filter_y, int y_step_q4,

126 h, filter_x);	247 // int w, int h);

127 src += 16;	248 FUN_CONV_2D(, ssse3);

128 dst += 16;	249 FUN_CONV_2D(avg_ , ssse3);

129 w -= 16;

130 }

131 while (w >= 8) {

132 vp9_filter_block1d8_h8_avg_ssse3(src, src_stride,

133 dst, dst_stride,

134 h, filter_x);

135 src += 8;

136 dst += 8;

137 w -= 8;

138 }

139 while (w >= 4) {

140 vp9_filter_block1d4_h8_avg_ssse3(src, src_stride,

141 dst, dst_stride,

142 h, filter_x);

143 src += 4;

144 dst += 4;

145 w -= 4;

146 }

147 }

148 if (w) {

149 vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,

150 filter_x, x_step_q4, filter_y, y_step_q4,

151 w, h);

152 }

153 }

154

155 void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,

156 uint8_t *dst, ptrdiff_t dst_stride,

157 const int16_t *filter_x, int x_step_q4,

158 const int16_t *filter_y, int y_step_q4,

159 int w, int h) {

160 if (y_step_q4 == 16 && filter_y[3] != 128) {

161 while (w >= 16) {

162 vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride,

163 dst, dst_stride,

164 h, filter_y);

165 src += 16;

166 dst += 16;

167 w -= 16;

168 }

169 while (w >= 8) {

170 vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride,

171 dst, dst_stride,

172 h, filter_y);

173 src += 8;

174 dst += 8;

175 w -= 8;

176 }

177 while (w >= 4) {

178 vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride,

179 dst, dst_stride,

180 h, filter_y);

181 src += 4;

182 dst += 4;

183 w -= 4;

184 }

185 }

186 if (w) {

187 vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,

188 filter_x, x_step_q4, filter_y, y_step_q4,

189 w, h);

190 }

191 }

192

193 void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,

194 uint8_t *dst, ptrdiff_t dst_stride,

195 const int16_t *filter_x, int x_step_q4,

196 const int16_t *filter_y, int y_step_q4,

197 int w, int h) {

198 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);

199

200 assert(w <= 64);

201 assert(h <= 64);

202 if (x_step_q4 == 16 && y_step_q4 == 16) {

203 vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,

204 filter_x, x_step_q4, filter_y, y_step_q4,

205 w, h + 7);

206 vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,

207 filter_x, x_step_q4, filter_y, y_step_q4, w, h);

208 } else {

209 vp9_convolve8_c(src, src_stride, dst, dst_stride,

210 filter_x, x_step_q4, filter_y, y_step_q4, w, h);

211 }

212 }

213

214 void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,

215 uint8_t *dst, ptrdiff_t dst_stride,

216 const int16_t *filter_x, int x_step_q4,

217 const int16_t *filter_y, int y_step_q4,

218 int w, int h) {

219 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);

220

221 assert(w <= 64);

222 assert(h <= 64);

223 if (x_step_q4 == 16 && y_step_q4 == 16) {

224 vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,

225 filter_x, x_step_q4, filter_y, y_step_q4,

226 w, h + 7);

227 vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,

228 filter_x, x_step_q4, filter_y, y_step_q4,

229 w, h);

230 } else {

231 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,

232 filter_x, x_step_q4, filter_y, y_step_q4, w, h);

233 }

234 }

235 #endif	250 #endif

236	251

237 #if HAVE_SSE2	252 #if HAVE_SSE2

238 filter8_1dfunction vp9_filter_block1d16_v8_sse2;	253 filter8_1dfunction vp9_filter_block1d16_v8_sse2;

239 filter8_1dfunction vp9_filter_block1d16_h8_sse2;	254 filter8_1dfunction vp9_filter_block1d16_h8_sse2;

240 filter8_1dfunction vp9_filter_block1d8_v8_sse2;	255 filter8_1dfunction vp9_filter_block1d8_v8_sse2;

241 filter8_1dfunction vp9_filter_block1d8_h8_sse2;	256 filter8_1dfunction vp9_filter_block1d8_h8_sse2;

242 filter8_1dfunction vp9_filter_block1d4_v8_sse2;	257 filter8_1dfunction vp9_filter_block1d4_v8_sse2;

243 filter8_1dfunction vp9_filter_block1d4_h8_sse2;	258 filter8_1dfunction vp9_filter_block1d4_h8_sse2;

244 filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;	259 filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;

245 filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;	260 filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;

246 filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;	261 filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;

247 filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;	262 filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;

248 filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;	263 filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;

249 filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;	264 filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;

250	265

251 void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,	266 filter8_1dfunction vp9_filter_block1d16_v2_sse2;

252 uint8_t *dst, ptrdiff_t dst_stride,	267 filter8_1dfunction vp9_filter_block1d16_h2_sse2;

253 const int16_t *filter_x, int x_step_q4,	268 filter8_1dfunction vp9_filter_block1d8_v2_sse2;

254 const int16_t *filter_y, int y_step_q4,	269 filter8_1dfunction vp9_filter_block1d8_h2_sse2;

255 int w, int h) {	270 filter8_1dfunction vp9_filter_block1d4_v2_sse2;

256 /* Ensure the filter can be compressed to int16_t. */	271 filter8_1dfunction vp9_filter_block1d4_h2_sse2;

257 if (x_step_q4 == 16 && filter_x[3] != 128) {	272 filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2;

258 while (w >= 16) {	273 filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2;

259 vp9_filter_block1d16_h8_sse2(src, src_stride,	274 filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2;

260 dst, dst_stride,	275 filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2;

261 h, filter_x);	276 filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2;

262 src += 16;	277 filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;

263 dst += 16;

264 w -= 16;

265 }

266 while (w >= 8) {

267 vp9_filter_block1d8_h8_sse2(src, src_stride,

268 dst, dst_stride,

269 h, filter_x);

270 src += 8;

271 dst += 8;

272 w -= 8;

273 }

274 while (w >= 4) {

275 vp9_filter_block1d4_h8_sse2(src, src_stride,

276 dst, dst_stride,

277 h, filter_x);

278 src += 4;

279 dst += 4;

280 w -= 4;

281 }

282 }

283 if (w) {

284 vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,

285 filter_x, x_step_q4, filter_y, y_step_q4,

286 w, h);

287 }

288 }

289	278

290 void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,	279 // void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,

291 uint8_t *dst, ptrdiff_t dst_stride,	280 // uint8_t *dst, ptrdiff_t dst_stride,

292 const int16_t *filter_x, int x_step_q4,	281 // const int16_t *filter_x, int x_step_q4,

293 const int16_t *filter_y, int y_step_q4,	282 // const int16_t *filter_y, int y_step_q4,

294 int w, int h) {	283 // int w, int h);

295 if (y_step_q4 == 16 && filter_y[3] != 128) {	284 // void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,

296 while (w >= 16) {	285 // uint8_t *dst, ptrdiff_t dst_stride,

297 vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride,	286 // const int16_t *filter_x, int x_step_q4,

298 dst, dst_stride,	287 // const int16_t *filter_y, int y_step_q4,

299 h, filter_y);	288 // int w, int h);

300 src += 16;	289 // void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,

301 dst += 16;	290 // uint8_t *dst, ptrdiff_t dst_stride,

302 w -= 16;	291 // const int16_t *filter_x, int x_step_q4,

303 }	292 // const int16_t *filter_y, int y_step_q4,

304 while (w >= 8) {	293 // int w, int h);

305 vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride,	294 // void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,

306 dst, dst_stride,	295 // uint8_t *dst, ptrdiff_t dst_stride,

307 h, filter_y);	296 // const int16_t *filter_x, int x_step_q4,

308 src += 8;	297 // const int16_t *filter_y, int y_step_q4,

309 dst += 8;	298 // int w, int h);

310 w -= 8;	299 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);

311 }	300 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);

312 while (w >= 4) {	301 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);

313 vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride,	302 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);

314 dst, dst_stride,

315 h, filter_y);

316 src += 4;

317 dst += 4;

318 w -= 4;

319 }

320 }

321 if (w) {

322 vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,

323 filter_x, x_step_q4, filter_y, y_step_q4,

324 w, h);

325 }

326 }

327	303

328 void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,	304 // void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,

329 uint8_t *dst, ptrdiff_t dst_stride,	305 // uint8_t *dst, ptrdiff_t dst_stride,

330 const int16_t *filter_x, int x_step_q4,	306 // const int16_t *filter_x, int x_step_q4,

331 const int16_t *filter_y, int y_step_q4,	307 // const int16_t *filter_y, int y_step_q4,

332 int w, int h) {	308 // int w, int h);

333 if (x_step_q4 == 16 && filter_x[3] != 128) {	309 // void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,

334 while (w >= 16) {	310 // uint8_t *dst, ptrdiff_t dst_stride,

335 vp9_filter_block1d16_h8_avg_sse2(src, src_stride,	311 // const int16_t *filter_x, int x_step_q4,

336 dst, dst_stride,	312 // const int16_t *filter_y, int y_step_q4,

337 h, filter_x);	313 // int w, int h);

338 src += 16;	314 FUN_CONV_2D(, sse2);

339 dst += 16;	315 FUN_CONV_2D(avg_ , sse2);

340 w -= 16;

341 }

342 while (w >= 8) {

343 vp9_filter_block1d8_h8_avg_sse2(src, src_stride,

344 dst, dst_stride,

345 h, filter_x);

346 src += 8;

347 dst += 8;

348 w -= 8;

349 }

350 while (w >= 4) {

351 vp9_filter_block1d4_h8_avg_sse2(src, src_stride,

352 dst, dst_stride,

353 h, filter_x);

354 src += 4;

355 dst += 4;

356 w -= 4;

357 }

358 }

359 if (w) {

360 vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,

361 filter_x, x_step_q4, filter_y, y_step_q4,

362 w, h);

363 }

364 }

365

366 void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,

367 uint8_t *dst, ptrdiff_t dst_stride,

368 const int16_t *filter_x, int x_step_q4,

369 const int16_t *filter_y, int y_step_q4,

370 int w, int h) {

371 if (y_step_q4 == 16 && filter_y[3] != 128) {

372 while (w >= 16) {

373 vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride,

374 dst, dst_stride,

375 h, filter_y);

376 src += 16;

377 dst += 16;

378 w -= 16;

379 }

380 while (w >= 8) {

381 vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride,

382 dst, dst_stride,

383 h, filter_y);

384 src += 8;

385 dst += 8;

386 w -= 8;

387 }

388 while (w >= 4) {

389 vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride,

390 dst, dst_stride,

391 h, filter_y);

392 src += 4;

393 dst += 4;

394 w -= 4;

395 }

396 }

397 if (w) {

398 vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,

399 filter_x, x_step_q4, filter_y, y_step_q4,

400 w, h);

401 }

402 }

403

404 void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,

405 uint8_t *dst, ptrdiff_t dst_stride,

406 const int16_t *filter_x, int x_step_q4,

407 const int16_t *filter_y, int y_step_q4,

408 int w, int h) {

409 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);

410

411 assert(w <= 64);

412 assert(h <= 64);

413 if (x_step_q4 == 16 && y_step_q4 == 16) {

414 vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,

415 filter_x, x_step_q4, filter_y, y_step_q4,

416 w, h + 7);

417 vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,

418 filter_x, x_step_q4, filter_y, y_step_q4, w, h);

419 } else {

420 vp9_convolve8_c(src, src_stride, dst, dst_stride,

421 filter_x, x_step_q4, filter_y, y_step_q4, w, h);

422 }

423 }

424

425 void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,

426 uint8_t *dst, ptrdiff_t dst_stride,

427 const int16_t *filter_x, int x_step_q4,

428 const int16_t *filter_y, int y_step_q4,

429 int w, int h) {

430 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);

431

432 assert(w <= 64);

433 assert(h <= 64);

434 if (x_step_q4 == 16 && y_step_q4 == 16) {

435 vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,

436 filter_x, x_step_q4, filter_y, y_step_q4,

437 w, h + 7);

438 vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,

439 filter_x, x_step_q4, filter_y, y_step_q4,

440 w, h);

441 } else {

442 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,

443 filter_x, x_step_q4, filter_y, y_step_q4, w, h);

444 }

445 }

446 #endif	316 #endif

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/common/vp9_systemdependent.h ('k') | source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c » ('j') | no next file with comments »