Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(332)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_asm_stubs.c

Issue 168343002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: libvpx: Pull from upstream Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include <assert.h> 11 #include <assert.h>
12 12
13 #include "./vpx_config.h" 13 #include "./vpx_config.h"
14 #include "./vp9_rtcd.h" 14 #include "./vp9_rtcd.h"
15 #include "vpx_ports/mem.h" 15 #include "vpx_ports/mem.h"
16 16
17 typedef void filter8_1dfunction ( 17 typedef void filter8_1dfunction (
18 const unsigned char *src_ptr, 18 const unsigned char *src_ptr,
19 const unsigned int src_pitch, 19 const ptrdiff_t src_pitch,
20 unsigned char *output_ptr, 20 unsigned char *output_ptr,
21 unsigned int out_pitch, 21 ptrdiff_t out_pitch,
22 unsigned int output_height, 22 unsigned int output_height,
23 const short *filter 23 const short *filter
24 ); 24 );
25 25
26 #define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
27 void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
28 uint8_t *dst, ptrdiff_t dst_stride, \
29 const int16_t *filter_x, int x_step_q4, \
30 const int16_t *filter_y, int y_step_q4, \
31 int w, int h) { \
32 if (step_q4 == 16 && filter[3] != 128) { \
33 if (filter[0] || filter[1] || filter[2]) { \
34 while (w >= 16) { \
35 vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \
36 src_stride, \
37 dst, \
38 dst_stride, \
39 h, \
40 filter); \
41 src += 16; \
42 dst += 16; \
43 w -= 16; \
44 } \
45 while (w >= 8) { \
46 vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \
47 src_stride, \
48 dst, \
49 dst_stride, \
50 h, \
51 filter); \
52 src += 8; \
53 dst += 8; \
54 w -= 8; \
55 } \
56 while (w >= 4) { \
57 vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \
58 src_stride, \
59 dst, \
60 dst_stride, \
61 h, \
62 filter); \
63 src += 4; \
64 dst += 4; \
65 w -= 4; \
66 } \
67 } else { \
68 while (w >= 16) { \
69 vp9_filter_block1d16_##dir##2_##avg##opt(src, \
70 src_stride, \
71 dst, \
72 dst_stride, \
73 h, \
74 filter); \
75 src += 16; \
76 dst += 16; \
77 w -= 16; \
78 } \
79 while (w >= 8) { \
80 vp9_filter_block1d8_##dir##2_##avg##opt(src, \
81 src_stride, \
82 dst, \
83 dst_stride, \
84 h, \
85 filter); \
86 src += 8; \
87 dst += 8; \
88 w -= 8; \
89 } \
90 while (w >= 4) { \
91 vp9_filter_block1d4_##dir##2_##avg##opt(src, \
92 src_stride, \
93 dst, \
94 dst_stride, \
95 h, \
96 filter); \
97 src += 4; \
98 dst += 4; \
99 w -= 4; \
100 } \
101 } \
102 } \
103 if (w) { \
104 vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
105 filter_x, x_step_q4, filter_y, y_step_q4, \
106 w, h); \
107 } \
108 }
109
110 #define FUN_CONV_2D(avg, opt) \
111 void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
112 uint8_t *dst, ptrdiff_t dst_stride, \
113 const int16_t *filter_x, int x_step_q4, \
114 const int16_t *filter_y, int y_step_q4, \
115 int w, int h) { \
116 assert(w <= 64); \
117 assert(h <= 64); \
118 if (x_step_q4 == 16 && y_step_q4 == 16) { \
119 if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
120 filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
121 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \
122 vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
123 filter_x, x_step_q4, filter_y, y_step_q4, \
124 w, h + 7); \
125 vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
126 filter_x, x_step_q4, filter_y, \
127 y_step_q4, w, h); \
128 } else { \
129 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 65); \
130 vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
131 filter_x, x_step_q4, filter_y, y_step_q4, \
132 w, h + 1); \
133 vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
134 filter_x, x_step_q4, filter_y, \
135 y_step_q4, w, h); \
136 } \
137 } else { \
138 vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
139 filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
140 } \
141 }
142 #if HAVE_AVX2
143 filter8_1dfunction vp9_filter_block1d16_v8_avx2;
144 filter8_1dfunction vp9_filter_block1d16_h8_avx2;
145 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
146 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
147 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
148 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
149 filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
150 filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
151 filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
152 filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
153 filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
154 filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
155 #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
156 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
157 #define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
158 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
159 #define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
160 #define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
161 #define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3
162 #define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3
163 #define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3
164 #define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3
165 // void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
166 // uint8_t *dst, ptrdiff_t dst_stride,
167 // const int16_t *filter_x, int x_step_q4,
168 // const int16_t *filter_y, int y_step_q4,
169 // int w, int h);
170 // void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
171 // uint8_t *dst, ptrdiff_t dst_stride,
172 // const int16_t *filter_x, int x_step_q4,
173 // const int16_t *filter_y, int y_step_q4,
174 // int w, int h);
175 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
176 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
177
178 // void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
179 // uint8_t *dst, ptrdiff_t dst_stride,
180 // const int16_t *filter_x, int x_step_q4,
181 // const int16_t *filter_y, int y_step_q4,
182 // int w, int h);
183 FUN_CONV_2D(, avx2);
184 #endif
26 #if HAVE_SSSE3 185 #if HAVE_SSSE3
27 filter8_1dfunction vp9_filter_block1d16_v8_ssse3; 186 filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
28 filter8_1dfunction vp9_filter_block1d16_h8_ssse3; 187 filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
29 filter8_1dfunction vp9_filter_block1d8_v8_ssse3; 188 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
30 filter8_1dfunction vp9_filter_block1d8_h8_ssse3; 189 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
31 filter8_1dfunction vp9_filter_block1d4_v8_ssse3; 190 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
32 filter8_1dfunction vp9_filter_block1d4_h8_ssse3; 191 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
33 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; 192 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
34 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; 193 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
35 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; 194 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
36 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; 195 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
37 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; 196 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
38 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; 197 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
39 198
40 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, 199 filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
41 uint8_t *dst, ptrdiff_t dst_stride, 200 filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
42 const int16_t *filter_x, int x_step_q4, 201 filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
43 const int16_t *filter_y, int y_step_q4, 202 filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
44 int w, int h) { 203 filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
45 /* Ensure the filter can be compressed to int16_t. */ 204 filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
46 if (x_step_q4 == 16 && filter_x[3] != 128) { 205 filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
47 while (w >= 16) { 206 filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
48 vp9_filter_block1d16_h8_ssse3(src, src_stride, 207 filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
49 dst, dst_stride, 208 filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
50 h, filter_x); 209 filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
51 src += 16; 210 filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
52 dst += 16;
53 w -= 16;
54 }
55 while (w >= 8) {
56 vp9_filter_block1d8_h8_ssse3(src, src_stride,
57 dst, dst_stride,
58 h, filter_x);
59 src += 8;
60 dst += 8;
61 w -= 8;
62 }
63 while (w >= 4) {
64 vp9_filter_block1d4_h8_ssse3(src, src_stride,
65 dst, dst_stride,
66 h, filter_x);
67 src += 4;
68 dst += 4;
69 w -= 4;
70 }
71 }
72 if (w) {
73 vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
74 filter_x, x_step_q4, filter_y, y_step_q4,
75 w, h);
76 }
77 }
78 211
79 void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, 212 // void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
80 uint8_t *dst, ptrdiff_t dst_stride, 213 // uint8_t *dst, ptrdiff_t dst_stride,
81 const int16_t *filter_x, int x_step_q4, 214 // const int16_t *filter_x, int x_step_q4,
82 const int16_t *filter_y, int y_step_q4, 215 // const int16_t *filter_y, int y_step_q4,
83 int w, int h) { 216 // int w, int h);
84 if (y_step_q4 == 16 && filter_y[3] != 128) { 217 // void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
85 while (w >= 16) { 218 // uint8_t *dst, ptrdiff_t dst_stride,
86 vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride, 219 // const int16_t *filter_x, int x_step_q4,
87 dst, dst_stride, 220 // const int16_t *filter_y, int y_step_q4,
88 h, filter_y); 221 // int w, int h);
89 src += 16; 222 // void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
90 dst += 16; 223 // uint8_t *dst, ptrdiff_t dst_stride,
91 w -= 16; 224 // const int16_t *filter_x, int x_step_q4,
92 } 225 // const int16_t *filter_y, int y_step_q4,
93 while (w >= 8) { 226 // int w, int h);
94 vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride, 227 // void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
95 dst, dst_stride, 228 // uint8_t *dst, ptrdiff_t dst_stride,
96 h, filter_y); 229 // const int16_t *filter_x, int x_step_q4,
97 src += 8; 230 // const int16_t *filter_y, int y_step_q4,
98 dst += 8; 231 // int w, int h);
99 w -= 8; 232 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
100 } 233 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
101 while (w >= 4) { 234 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
102 vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride, 235 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
103 dst, dst_stride, 236 ssse3);
104 h, filter_y);
105 src += 4;
106 dst += 4;
107 w -= 4;
108 }
109 }
110 if (w) {
111 vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
112 filter_x, x_step_q4, filter_y, y_step_q4,
113 w, h);
114 }
115 }
116 237
117 void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, 238 // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
118 uint8_t *dst, ptrdiff_t dst_stride, 239 // uint8_t *dst, ptrdiff_t dst_stride,
119 const int16_t *filter_x, int x_step_q4, 240 // const int16_t *filter_x, int x_step_q4,
120 const int16_t *filter_y, int y_step_q4, 241 // const int16_t *filter_y, int y_step_q4,
121 int w, int h) { 242 // int w, int h);
122 if (x_step_q4 == 16 && filter_x[3] != 128) { 243 // void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
123 while (w >= 16) { 244 // uint8_t *dst, ptrdiff_t dst_stride,
124 vp9_filter_block1d16_h8_avg_ssse3(src, src_stride, 245 // const int16_t *filter_x, int x_step_q4,
125 dst, dst_stride, 246 // const int16_t *filter_y, int y_step_q4,
126 h, filter_x); 247 // int w, int h);
127 src += 16; 248 FUN_CONV_2D(, ssse3);
128 dst += 16; 249 FUN_CONV_2D(avg_ , ssse3);
129 w -= 16;
130 }
131 while (w >= 8) {
132 vp9_filter_block1d8_h8_avg_ssse3(src, src_stride,
133 dst, dst_stride,
134 h, filter_x);
135 src += 8;
136 dst += 8;
137 w -= 8;
138 }
139 while (w >= 4) {
140 vp9_filter_block1d4_h8_avg_ssse3(src, src_stride,
141 dst, dst_stride,
142 h, filter_x);
143 src += 4;
144 dst += 4;
145 w -= 4;
146 }
147 }
148 if (w) {
149 vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
150 filter_x, x_step_q4, filter_y, y_step_q4,
151 w, h);
152 }
153 }
154
155 void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
156 uint8_t *dst, ptrdiff_t dst_stride,
157 const int16_t *filter_x, int x_step_q4,
158 const int16_t *filter_y, int y_step_q4,
159 int w, int h) {
160 if (y_step_q4 == 16 && filter_y[3] != 128) {
161 while (w >= 16) {
162 vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride,
163 dst, dst_stride,
164 h, filter_y);
165 src += 16;
166 dst += 16;
167 w -= 16;
168 }
169 while (w >= 8) {
170 vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride,
171 dst, dst_stride,
172 h, filter_y);
173 src += 8;
174 dst += 8;
175 w -= 8;
176 }
177 while (w >= 4) {
178 vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride,
179 dst, dst_stride,
180 h, filter_y);
181 src += 4;
182 dst += 4;
183 w -= 4;
184 }
185 }
186 if (w) {
187 vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
188 filter_x, x_step_q4, filter_y, y_step_q4,
189 w, h);
190 }
191 }
192
193 void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
194 uint8_t *dst, ptrdiff_t dst_stride,
195 const int16_t *filter_x, int x_step_q4,
196 const int16_t *filter_y, int y_step_q4,
197 int w, int h) {
198 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
199
200 assert(w <= 64);
201 assert(h <= 64);
202 if (x_step_q4 == 16 && y_step_q4 == 16) {
203 vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
204 filter_x, x_step_q4, filter_y, y_step_q4,
205 w, h + 7);
206 vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
207 filter_x, x_step_q4, filter_y, y_step_q4, w, h);
208 } else {
209 vp9_convolve8_c(src, src_stride, dst, dst_stride,
210 filter_x, x_step_q4, filter_y, y_step_q4, w, h);
211 }
212 }
213
214 void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
215 uint8_t *dst, ptrdiff_t dst_stride,
216 const int16_t *filter_x, int x_step_q4,
217 const int16_t *filter_y, int y_step_q4,
218 int w, int h) {
219 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
220
221 assert(w <= 64);
222 assert(h <= 64);
223 if (x_step_q4 == 16 && y_step_q4 == 16) {
224 vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
225 filter_x, x_step_q4, filter_y, y_step_q4,
226 w, h + 7);
227 vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
228 filter_x, x_step_q4, filter_y, y_step_q4,
229 w, h);
230 } else {
231 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
232 filter_x, x_step_q4, filter_y, y_step_q4, w, h);
233 }
234 }
235 #endif 250 #endif
236 251
237 #if HAVE_SSE2 252 #if HAVE_SSE2
238 filter8_1dfunction vp9_filter_block1d16_v8_sse2; 253 filter8_1dfunction vp9_filter_block1d16_v8_sse2;
239 filter8_1dfunction vp9_filter_block1d16_h8_sse2; 254 filter8_1dfunction vp9_filter_block1d16_h8_sse2;
240 filter8_1dfunction vp9_filter_block1d8_v8_sse2; 255 filter8_1dfunction vp9_filter_block1d8_v8_sse2;
241 filter8_1dfunction vp9_filter_block1d8_h8_sse2; 256 filter8_1dfunction vp9_filter_block1d8_h8_sse2;
242 filter8_1dfunction vp9_filter_block1d4_v8_sse2; 257 filter8_1dfunction vp9_filter_block1d4_v8_sse2;
243 filter8_1dfunction vp9_filter_block1d4_h8_sse2; 258 filter8_1dfunction vp9_filter_block1d4_h8_sse2;
244 filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2; 259 filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
245 filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2; 260 filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
246 filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2; 261 filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
247 filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2; 262 filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
248 filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2; 263 filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
249 filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2; 264 filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
250 265
251 void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, 266 filter8_1dfunction vp9_filter_block1d16_v2_sse2;
252 uint8_t *dst, ptrdiff_t dst_stride, 267 filter8_1dfunction vp9_filter_block1d16_h2_sse2;
253 const int16_t *filter_x, int x_step_q4, 268 filter8_1dfunction vp9_filter_block1d8_v2_sse2;
254 const int16_t *filter_y, int y_step_q4, 269 filter8_1dfunction vp9_filter_block1d8_h2_sse2;
255 int w, int h) { 270 filter8_1dfunction vp9_filter_block1d4_v2_sse2;
256 /* Ensure the filter can be compressed to int16_t. */ 271 filter8_1dfunction vp9_filter_block1d4_h2_sse2;
257 if (x_step_q4 == 16 && filter_x[3] != 128) { 272 filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2;
258 while (w >= 16) { 273 filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2;
259 vp9_filter_block1d16_h8_sse2(src, src_stride, 274 filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2;
260 dst, dst_stride, 275 filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2;
261 h, filter_x); 276 filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2;
262 src += 16; 277 filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
263 dst += 16;
264 w -= 16;
265 }
266 while (w >= 8) {
267 vp9_filter_block1d8_h8_sse2(src, src_stride,
268 dst, dst_stride,
269 h, filter_x);
270 src += 8;
271 dst += 8;
272 w -= 8;
273 }
274 while (w >= 4) {
275 vp9_filter_block1d4_h8_sse2(src, src_stride,
276 dst, dst_stride,
277 h, filter_x);
278 src += 4;
279 dst += 4;
280 w -= 4;
281 }
282 }
283 if (w) {
284 vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
285 filter_x, x_step_q4, filter_y, y_step_q4,
286 w, h);
287 }
288 }
289 278
290 void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, 279 // void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
291 uint8_t *dst, ptrdiff_t dst_stride, 280 // uint8_t *dst, ptrdiff_t dst_stride,
292 const int16_t *filter_x, int x_step_q4, 281 // const int16_t *filter_x, int x_step_q4,
293 const int16_t *filter_y, int y_step_q4, 282 // const int16_t *filter_y, int y_step_q4,
294 int w, int h) { 283 // int w, int h);
295 if (y_step_q4 == 16 && filter_y[3] != 128) { 284 // void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
296 while (w >= 16) { 285 // uint8_t *dst, ptrdiff_t dst_stride,
297 vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride, 286 // const int16_t *filter_x, int x_step_q4,
298 dst, dst_stride, 287 // const int16_t *filter_y, int y_step_q4,
299 h, filter_y); 288 // int w, int h);
300 src += 16; 289 // void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
301 dst += 16; 290 // uint8_t *dst, ptrdiff_t dst_stride,
302 w -= 16; 291 // const int16_t *filter_x, int x_step_q4,
303 } 292 // const int16_t *filter_y, int y_step_q4,
304 while (w >= 8) { 293 // int w, int h);
305 vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride, 294 // void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
306 dst, dst_stride, 295 // uint8_t *dst, ptrdiff_t dst_stride,
307 h, filter_y); 296 // const int16_t *filter_x, int x_step_q4,
308 src += 8; 297 // const int16_t *filter_y, int y_step_q4,
309 dst += 8; 298 // int w, int h);
310 w -= 8; 299 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
311 } 300 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
312 while (w >= 4) { 301 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
313 vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride, 302 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
314 dst, dst_stride,
315 h, filter_y);
316 src += 4;
317 dst += 4;
318 w -= 4;
319 }
320 }
321 if (w) {
322 vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
323 filter_x, x_step_q4, filter_y, y_step_q4,
324 w, h);
325 }
326 }
327 303
328 void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, 304 // void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
329 uint8_t *dst, ptrdiff_t dst_stride, 305 // uint8_t *dst, ptrdiff_t dst_stride,
330 const int16_t *filter_x, int x_step_q4, 306 // const int16_t *filter_x, int x_step_q4,
331 const int16_t *filter_y, int y_step_q4, 307 // const int16_t *filter_y, int y_step_q4,
332 int w, int h) { 308 // int w, int h);
333 if (x_step_q4 == 16 && filter_x[3] != 128) { 309 // void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
334 while (w >= 16) { 310 // uint8_t *dst, ptrdiff_t dst_stride,
335 vp9_filter_block1d16_h8_avg_sse2(src, src_stride, 311 // const int16_t *filter_x, int x_step_q4,
336 dst, dst_stride, 312 // const int16_t *filter_y, int y_step_q4,
337 h, filter_x); 313 // int w, int h);
338 src += 16; 314 FUN_CONV_2D(, sse2);
339 dst += 16; 315 FUN_CONV_2D(avg_ , sse2);
340 w -= 16;
341 }
342 while (w >= 8) {
343 vp9_filter_block1d8_h8_avg_sse2(src, src_stride,
344 dst, dst_stride,
345 h, filter_x);
346 src += 8;
347 dst += 8;
348 w -= 8;
349 }
350 while (w >= 4) {
351 vp9_filter_block1d4_h8_avg_sse2(src, src_stride,
352 dst, dst_stride,
353 h, filter_x);
354 src += 4;
355 dst += 4;
356 w -= 4;
357 }
358 }
359 if (w) {
360 vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
361 filter_x, x_step_q4, filter_y, y_step_q4,
362 w, h);
363 }
364 }
365
366 void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
367 uint8_t *dst, ptrdiff_t dst_stride,
368 const int16_t *filter_x, int x_step_q4,
369 const int16_t *filter_y, int y_step_q4,
370 int w, int h) {
371 if (y_step_q4 == 16 && filter_y[3] != 128) {
372 while (w >= 16) {
373 vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride,
374 dst, dst_stride,
375 h, filter_y);
376 src += 16;
377 dst += 16;
378 w -= 16;
379 }
380 while (w >= 8) {
381 vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride,
382 dst, dst_stride,
383 h, filter_y);
384 src += 8;
385 dst += 8;
386 w -= 8;
387 }
388 while (w >= 4) {
389 vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride,
390 dst, dst_stride,
391 h, filter_y);
392 src += 4;
393 dst += 4;
394 w -= 4;
395 }
396 }
397 if (w) {
398 vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
399 filter_x, x_step_q4, filter_y, y_step_q4,
400 w, h);
401 }
402 }
403
404 void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
405 uint8_t *dst, ptrdiff_t dst_stride,
406 const int16_t *filter_x, int x_step_q4,
407 const int16_t *filter_y, int y_step_q4,
408 int w, int h) {
409 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
410
411 assert(w <= 64);
412 assert(h <= 64);
413 if (x_step_q4 == 16 && y_step_q4 == 16) {
414 vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
415 filter_x, x_step_q4, filter_y, y_step_q4,
416 w, h + 7);
417 vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
418 filter_x, x_step_q4, filter_y, y_step_q4, w, h);
419 } else {
420 vp9_convolve8_c(src, src_stride, dst, dst_stride,
421 filter_x, x_step_q4, filter_y, y_step_q4, w, h);
422 }
423 }
424
425 void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
426 uint8_t *dst, ptrdiff_t dst_stride,
427 const int16_t *filter_x, int x_step_q4,
428 const int16_t *filter_y, int y_step_q4,
429 int w, int h) {
430 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
431
432 assert(w <= 64);
433 assert(h <= 64);
434 if (x_step_q4 == 16 && y_step_q4 == 16) {
435 vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
436 filter_x, x_step_q4, filter_y, y_step_q4,
437 w, h + 7);
438 vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
439 filter_x, x_step_q4, filter_y, y_step_q4,
440 w, h);
441 } else {
442 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
443 filter_x, x_step_q4, filter_y, y_step_q4, w, h);
444 }
445 }
446 #endif 316 #endif
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/vp9_systemdependent.h ('k') | source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698