OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <assert.h> | 11 #include <assert.h> |
12 | 12 |
13 #include "./vpx_config.h" | 13 #include "./vpx_config.h" |
14 #include "./vp9_rtcd.h" | 14 #include "./vp9_rtcd.h" |
15 #include "vpx_ports/mem.h" | 15 #include "vpx_ports/mem.h" |
16 | |
17 /////////////////////////////////////////////////////////////////////////// | 16 /////////////////////////////////////////////////////////////////////////// |
18 // the mmx function that does the bilinear filtering and var calculation // | 17 // the mmx function that does the bilinear filtering and var calculation // |
19 // int one pass // | 18 // int one pass // |
20 /////////////////////////////////////////////////////////////////////////// | 19 /////////////////////////////////////////////////////////////////////////// |
21 DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = { | 20 DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = { |
22 { 128, 128, 128, 128, 0, 0, 0, 0 }, | 21 { 128, 128, 128, 128, 0, 0, 0, 0 }, |
23 { 120, 120, 120, 120, 8, 8, 8, 8 }, | 22 { 120, 120, 120, 120, 8, 8, 8, 8 }, |
24 { 112, 112, 112, 112, 16, 16, 16, 16 }, | 23 { 112, 112, 112, 112, 16, 16, 16, 16 }, |
25 { 104, 104, 104, 104, 24, 24, 24, 24 }, | 24 { 104, 104, 104, 104, 24, 24, 24, 24 }, |
26 { 96, 96, 96, 96, 32, 32, 32, 32 }, | 25 { 96, 96, 96, 96, 32, 32, 32, 32 }, |
27 { 88, 88, 88, 88, 40, 40, 40, 40 }, | 26 { 88, 88, 88, 88, 40, 40, 40, 40 }, |
28 { 80, 80, 80, 80, 48, 48, 48, 48 }, | 27 { 80, 80, 80, 80, 48, 48, 48, 48 }, |
29 { 72, 72, 72, 72, 56, 56, 56, 56 }, | 28 { 72, 72, 72, 72, 56, 56, 56, 56 }, |
30 { 64, 64, 64, 64, 64, 64, 64, 64 }, | 29 { 64, 64, 64, 64, 64, 64, 64, 64 }, |
31 { 56, 56, 56, 56, 72, 72, 72, 72 }, | 30 { 56, 56, 56, 56, 72, 72, 72, 72 }, |
32 { 48, 48, 48, 48, 80, 80, 80, 80 }, | 31 { 48, 48, 48, 48, 80, 80, 80, 80 }, |
33 { 40, 40, 40, 40, 88, 88, 88, 88 }, | 32 { 40, 40, 40, 40, 88, 88, 88, 88 }, |
34 { 32, 32, 32, 32, 96, 96, 96, 96 }, | 33 { 32, 32, 32, 32, 96, 96, 96, 96 }, |
35 { 24, 24, 24, 24, 104, 104, 104, 104 }, | 34 { 24, 24, 24, 24, 104, 104, 104, 104 }, |
36 { 16, 16, 16, 16, 112, 112, 112, 112 }, | 35 { 16, 16, 16, 16, 112, 112, 112, 112 }, |
37 { 8, 8, 8, 8, 120, 120, 120, 120 } | 36 { 8, 8, 8, 8, 120, 120, 120, 120 } |
38 }; | 37 }; |
39 | 38 |
40 | |
41 int num_func_entry = 0; | |
42 #if HAVE_SSSE3 | 39 #if HAVE_SSSE3 |
43 void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr, | 40 void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr, |
44 const unsigned int src_pitch, | 41 const unsigned int src_pitch, |
45 unsigned char *output_ptr, | 42 unsigned char *output_ptr, |
46 unsigned int out_pitch, | 43 unsigned int out_pitch, |
47 unsigned int output_height, | 44 unsigned int output_height, |
48 const short *filter); | 45 const short *filter); |
49 | 46 |
50 void vp9_filter_block1d16_v8_intrin_ssse3(const unsigned char *src_ptr, | |
51 const unsigned int src_pitch, | |
52 unsigned char *output_ptr, | |
53 unsigned int out_pitch, | |
54 unsigned int output_height, | |
55 const short *filter); | |
56 | |
57 void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr, | 47 void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr, |
58 const unsigned int src_pitch, | 48 const unsigned int src_pitch, |
59 unsigned char *output_ptr, | 49 unsigned char *output_ptr, |
60 unsigned int out_pitch, | 50 unsigned int out_pitch, |
61 unsigned int output_height, | 51 unsigned int output_height, |
62 const short *filter); | 52 const short *filter); |
63 | 53 |
64 void vp9_filter_block1d16_h8_intrin_ssse3(const unsigned char *src_ptr, | |
65 const unsigned int src_pitch, | |
66 unsigned char *output_ptr, | |
67 unsigned int out_pitch, | |
68 unsigned int output_height, | |
69 const short *filter); | |
70 | |
71 void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr, | 54 void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr, |
72 const unsigned int src_pitch, | 55 const unsigned int src_pitch, |
73 unsigned char *output_ptr, | 56 unsigned char *output_ptr, |
74 unsigned int out_pitch, | 57 unsigned int out_pitch, |
75 unsigned int output_height, | 58 unsigned int output_height, |
76 const short *filter); | 59 const short *filter); |
77 | 60 |
78 void vp9_filter_block1d8_v8_intrin_ssse3(const unsigned char *src_ptr, | |
79 const unsigned int src_pitch, | |
80 unsigned char *output_ptr, | |
81 unsigned int out_pitch, | |
82 unsigned int output_height, | |
83 const short *filter); | |
84 | |
85 void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr, | 61 void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr, |
86 const unsigned int src_pitch, | 62 const unsigned int src_pitch, |
87 unsigned char *output_ptr, | 63 unsigned char *output_ptr, |
88 unsigned int out_pitch, | 64 unsigned int out_pitch, |
89 unsigned int output_height, | 65 unsigned int output_height, |
90 const short *filter); | 66 const short *filter); |
91 | 67 |
92 void vp9_filter_block1d8_h8_intrin_ssse3(const unsigned char *src_ptr, | |
93 const unsigned int src_pitch, | |
94 unsigned char *output_ptr, | |
95 unsigned int out_pitch, | |
96 unsigned int output_height, | |
97 const short *filter); | |
98 | |
99 void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr, | 68 void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr, |
100 const unsigned int src_pitch, | 69 const unsigned int src_pitch, |
101 unsigned char *output_ptr, | 70 unsigned char *output_ptr, |
102 unsigned int out_pitch, | 71 unsigned int out_pitch, |
103 unsigned int output_height, | 72 unsigned int output_height, |
104 const short *filter); | 73 const short *filter); |
105 | 74 |
106 void vp9_filter_block1d4_v8_intrin_ssse3(const unsigned char *src_ptr, | |
107 const unsigned int src_pitch, | |
108 unsigned char *output_ptr, | |
109 unsigned int out_pitch, | |
110 unsigned int output_height, | |
111 const short *filter); | |
112 | |
113 | |
114 void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr, | 75 void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr, |
115 const unsigned int src_pitch, | 76 const unsigned int src_pitch, |
116 unsigned char *output_ptr, | 77 unsigned char *output_ptr, |
117 unsigned int out_pitch, | 78 unsigned int out_pitch, |
118 unsigned int output_height, | 79 unsigned int output_height, |
119 const short *filter); | 80 const short *filter); |
120 | 81 |
121 void vp9_filter_block1d4_h8_intrin_ssse3(const unsigned char *src_ptr, | |
122 const unsigned int src_pitch, | |
123 unsigned char *output_ptr, | |
124 unsigned int out_pitch, | |
125 unsigned int output_height, | |
126 const short *filter); | |
127 | |
128 void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr, | 82 void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr, |
129 const unsigned int src_pitch, | 83 const unsigned int src_pitch, |
130 unsigned char *output_ptr, | 84 unsigned char *output_ptr, |
131 unsigned int out_pitch, | 85 unsigned int out_pitch, |
132 unsigned int output_height, | 86 unsigned int output_height, |
133 const short *filter); | 87 const short *filter); |
134 | 88 |
135 void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr, | 89 void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr, |
136 const unsigned int src_pitch, | 90 const unsigned int src_pitch, |
137 unsigned char *output_ptr, | 91 unsigned char *output_ptr, |
(...skipping 30 matching lines...) Expand all Loading... |
168 const short *filter); | 122 const short *filter); |
169 | 123 |
170 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 124 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
171 uint8_t *dst, ptrdiff_t dst_stride, | 125 uint8_t *dst, ptrdiff_t dst_stride, |
172 const int16_t *filter_x, int x_step_q4, | 126 const int16_t *filter_x, int x_step_q4, |
173 const int16_t *filter_y, int y_step_q4, | 127 const int16_t *filter_y, int y_step_q4, |
174 int w, int h) { | 128 int w, int h) { |
175 /* Ensure the filter can be compressed to int16_t. */ | 129 /* Ensure the filter can be compressed to int16_t. */ |
176 if (x_step_q4 == 16 && filter_x[3] != 128) { | 130 if (x_step_q4 == 16 && filter_x[3] != 128) { |
177 while (w >= 16) { | 131 while (w >= 16) { |
178 vp9_filter_block1d16_h8_intrin_ssse3(src, src_stride, | 132 vp9_filter_block1d16_h8_ssse3(src, src_stride, |
179 dst, dst_stride, | 133 dst, dst_stride, |
180 h, filter_x); | 134 h, filter_x); |
181 src += 16; | 135 src += 16; |
182 dst += 16; | 136 dst += 16; |
183 w -= 16; | 137 w -= 16; |
184 } | 138 } |
185 while (w >= 8) { | 139 while (w >= 8) { |
186 vp9_filter_block1d8_h8_intrin_ssse3(src, src_stride, | 140 vp9_filter_block1d8_h8_ssse3(src, src_stride, |
187 dst, dst_stride, | 141 dst, dst_stride, |
188 h, filter_x); | 142 h, filter_x); |
189 src += 8; | 143 src += 8; |
190 dst += 8; | 144 dst += 8; |
191 w -= 8; | 145 w -= 8; |
192 } | 146 } |
193 while (w >= 4) { | 147 while (w >= 4) { |
194 vp9_filter_block1d4_h8_intrin_ssse3(src, src_stride, | 148 vp9_filter_block1d4_h8_ssse3(src, src_stride, |
195 dst, dst_stride, | 149 dst, dst_stride, |
196 h, filter_x); | 150 h, filter_x); |
197 src += 4; | 151 src += 4; |
198 dst += 4; | 152 dst += 4; |
199 w -= 4; | 153 w -= 4; |
200 } | 154 } |
201 } | 155 } |
202 if (w) { | 156 if (w) { |
203 vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, | 157 vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, |
204 filter_x, x_step_q4, filter_y, y_step_q4, | 158 filter_x, x_step_q4, filter_y, y_step_q4, |
205 w, h); | 159 w, h); |
206 } | 160 } |
207 } | 161 } |
208 | 162 |
209 void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 163 void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
210 uint8_t *dst, ptrdiff_t dst_stride, | 164 uint8_t *dst, ptrdiff_t dst_stride, |
211 const int16_t *filter_x, int x_step_q4, | 165 const int16_t *filter_x, int x_step_q4, |
212 const int16_t *filter_y, int y_step_q4, | 166 const int16_t *filter_y, int y_step_q4, |
213 int w, int h) { | 167 int w, int h) { |
214 if (y_step_q4 == 16 && filter_y[3] != 128) { | 168 if (y_step_q4 == 16 && filter_y[3] != 128) { |
215 while (w >= 16) { | 169 while (w >= 16) { |
216 vp9_filter_block1d16_v8_intrin_ssse3(src - src_stride * 3, src_stride, | 170 vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride, |
217 dst, dst_stride, | 171 dst, dst_stride, |
218 h, filter_y); | 172 h, filter_y); |
219 src += 16; | 173 src += 16; |
220 dst += 16; | 174 dst += 16; |
221 w -= 16; | 175 w -= 16; |
222 } | 176 } |
223 while (w >= 8) { | 177 while (w >= 8) { |
224 vp9_filter_block1d8_v8_intrin_ssse3(src - src_stride * 3, src_stride, | 178 vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride, |
225 dst, dst_stride, | 179 dst, dst_stride, |
226 h, filter_y); | 180 h, filter_y); |
227 src += 8; | 181 src += 8; |
228 dst += 8; | 182 dst += 8; |
229 w -= 8; | 183 w -= 8; |
230 } | 184 } |
231 while (w >= 4) { | 185 while (w >= 4) { |
232 vp9_filter_block1d4_v8_intrin_ssse3(src - src_stride * 3, src_stride, | 186 vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride, |
233 dst, dst_stride, | 187 dst, dst_stride, |
234 h, filter_y); | 188 h, filter_y); |
235 src += 4; | 189 src += 4; |
236 dst += 4; | 190 dst += 4; |
237 w -= 4; | 191 w -= 4; |
238 } | 192 } |
239 } | 193 } |
240 if (w) { | 194 if (w) { |
241 vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, | 195 vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, |
242 filter_x, x_step_q4, filter_y, y_step_q4, | 196 filter_x, x_step_q4, filter_y, y_step_q4, |
243 w, h); | 197 w, h); |
244 } | 198 } |
245 } | 199 } |
246 | 200 |
247 void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 201 void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
248 uint8_t *dst, ptrdiff_t dst_stride, | 202 uint8_t *dst, ptrdiff_t dst_stride, |
249 const int16_t *filter_x, int x_step_q4, | 203 const int16_t *filter_x, int x_step_q4, |
250 const int16_t *filter_y, int y_step_q4, | 204 const int16_t *filter_y, int y_step_q4, |
251 int w, int h) { | 205 int w, int h) { |
252 if (x_step_q4 == 16 && filter_x[3] != 128) { | 206 if (x_step_q4 == 16 && filter_x[3] != 128) { |
253 while (w >= 16) { | 207 while (w >= 16) { |
254 vp9_filter_block1d16_h8_avg_ssse3(src, src_stride, | 208 vp9_filter_block1d16_h8_avg_ssse3(src, src_stride, |
255 dst, dst_stride, | 209 dst, dst_stride, |
256 h, filter_x); | 210 h, filter_x); |
257 src += 16; | 211 src += 16; |
258 dst += 16; | 212 dst += 16; |
259 w -= 16; | 213 w -= 16; |
260 } | 214 } |
261 while (w >= 8) { | 215 while (w >= 8) { |
262 vp9_filter_block1d8_h8_avg_ssse3(src, src_stride, | 216 vp9_filter_block1d8_h8_avg_ssse3(src, src_stride, |
263 dst, dst_stride, | 217 dst, dst_stride, |
264 h, filter_x); | 218 h, filter_x); |
265 src += 8; | 219 src += 8; |
266 dst += 8; | 220 dst += 8; |
267 w -= 8; | 221 w -= 8; |
268 } | 222 } |
269 while (w >= 4) { | 223 while (w >= 4) { |
270 vp9_filter_block1d4_h8_avg_ssse3(src, src_stride, | 224 vp9_filter_block1d4_h8_avg_ssse3(src, src_stride, |
271 dst, dst_stride, | 225 dst, dst_stride, |
272 h, filter_x); | 226 h, filter_x); |
273 src += 4; | 227 src += 4; |
274 dst += 4; | 228 dst += 4; |
275 w -= 4; | 229 w -= 4; |
276 } | 230 } |
277 } | 231 } |
278 if (w) { | 232 if (w) { |
279 vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, | 233 vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, |
280 filter_x, x_step_q4, filter_y, y_step_q4, | 234 filter_x, x_step_q4, filter_y, y_step_q4, |
281 w, h); | 235 w, h); |
282 } | 236 } |
283 } | 237 } |
284 | 238 |
285 void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 239 void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
286 uint8_t *dst, ptrdiff_t dst_stride, | 240 uint8_t *dst, ptrdiff_t dst_stride, |
287 const int16_t *filter_x, int x_step_q4, | 241 const int16_t *filter_x, int x_step_q4, |
288 const int16_t *filter_y, int y_step_q4, | 242 const int16_t *filter_y, int y_step_q4, |
289 int w, int h) { | 243 int w, int h) { |
290 if (y_step_q4 == 16 && filter_y[3] != 128) { | 244 if (y_step_q4 == 16 && filter_y[3] != 128) { |
291 while (w >= 16) { | 245 while (w >= 16) { |
292 vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride, | 246 vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride, |
293 dst, dst_stride, | 247 dst, dst_stride, |
294 h, filter_y); | 248 h, filter_y); |
295 src += 16; | 249 src += 16; |
296 dst += 16; | 250 dst += 16; |
297 w -= 16; | 251 w -= 16; |
298 } | 252 } |
299 while (w >= 8) { | 253 while (w >= 8) { |
300 vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride, | 254 vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride, |
301 dst, dst_stride, | 255 dst, dst_stride, |
302 h, filter_y); | 256 h, filter_y); |
303 src += 8; | 257 src += 8; |
304 dst += 8; | 258 dst += 8; |
305 w -= 8; | 259 w -= 8; |
306 } | 260 } |
307 while (w >= 4) { | 261 while (w >= 4) { |
308 vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride, | 262 vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride, |
309 dst, dst_stride, | 263 dst, dst_stride, |
310 h, filter_y); | 264 h, filter_y); |
311 src += 4; | 265 src += 4; |
312 dst += 4; | 266 dst += 4; |
313 w -= 4; | 267 w -= 4; |
314 } | 268 } |
315 } | 269 } |
316 if (w) { | 270 if (w) { |
317 vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, | 271 vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, |
318 filter_x, x_step_q4, filter_y, y_step_q4, | 272 filter_x, x_step_q4, filter_y, y_step_q4, |
319 w, h); | 273 w, h); |
320 } | 274 } |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
356 w, h + 7); | 310 w, h + 7); |
357 vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, | 311 vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, |
358 filter_x, x_step_q4, filter_y, y_step_q4, | 312 filter_x, x_step_q4, filter_y, y_step_q4, |
359 w, h); | 313 w, h); |
360 } else { | 314 } else { |
361 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, | 315 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, |
362 filter_x, x_step_q4, filter_y, y_step_q4, w, h); | 316 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
363 } | 317 } |
364 } | 318 } |
365 #endif | 319 #endif |
OLD | NEW |