Chromium Code Reviews

Diff: source/libvpx/vp9/common/x86/vp9_loopfilter_x86.c

Issue 11974002: libvpx: Pull from upstream (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 7 years, 11 months ago
1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <emmintrin.h> // SSE2
12 #include "vpx_config.h"
13 #include "vp9/common/vp9_loopfilter.h"
14 #include "vpx_ports/emmintrin_compat.h"
15
16 prototype_loopfilter(vp9_loop_filter_vertical_edge_mmx);
17 prototype_loopfilter(vp9_loop_filter_horizontal_edge_mmx);
18
19 prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2);
20 prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2);
21
22 extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2;
23 extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2;
24
(...skipping 53 matching lines...)
78
79 void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
80 const unsigned char *blimit) {
81 vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
82 vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
83 vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
84 }
85 #endif
86
87 #if HAVE_SSE2
88
89 void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
90 int p,
91 const unsigned char *_blimit,
92 const unsigned char *_limit,
93 const unsigned char *_thresh) {
94 DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
95 DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);
96
97 DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
98 DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
99 DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
100 DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
101 DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
102 DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
103 __m128i mask, hev, flat, flat2;
104 const __m128i zero = _mm_set1_epi16(0);
105 __m128i p7, p6, p5;
106 __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
107 __m128i q5, q6, q7;
108 int i = 0;
109 const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
110 const unsigned int extended_limit = _limit[0] * 0x01010101u;
111 const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
112 const __m128i thresh =
113 _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
114 const __m128i limit =
115 _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
116 const __m128i blimit =
117 _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
118
119 p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
120 p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
121 p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
122 p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
123 p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
124 q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
125 q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
126 q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
127 q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
128 q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
129 {
130 const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
131 _mm_subs_epu8(p0, p1));
132 const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
133 _mm_subs_epu8(q0, q1));
134 const __m128i one = _mm_set1_epi8(1);
135 const __m128i fe = _mm_set1_epi8(0xfe);
136 const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
137 __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
138 _mm_subs_epu8(q0, p0));
139 __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
140 _mm_subs_epu8(q1, p1));
141 __m128i work;
142 flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
143 hev = _mm_subs_epu8(flat, thresh);
144 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
145
146 abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
147 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
148 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
149 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
150 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
151 mask = _mm_max_epu8(flat, mask);
152 // mask |= (abs(p1 - p0) > limit) * -1;
153 // mask |= (abs(q1 - q0) > limit) * -1;
154 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
155 _mm_subs_epu8(p1, p2)),
156 _mm_or_si128(_mm_subs_epu8(p3, p2),
157 _mm_subs_epu8(p2, p3)));
158 mask = _mm_max_epu8(work, mask);
159 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
160 _mm_subs_epu8(q1, q2)),
161 _mm_or_si128(_mm_subs_epu8(q3, q2),
162 _mm_subs_epu8(q2, q3)));
163 mask = _mm_max_epu8(work, mask);
164 mask = _mm_subs_epu8(mask, limit);
165 mask = _mm_cmpeq_epi8(mask, zero);
166
167 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
168 _mm_subs_epu8(p0, p2)),
169 _mm_or_si128(_mm_subs_epu8(q2, q0),
170 _mm_subs_epu8(q0, q2)));
171 flat = _mm_max_epu8(work, flat);
172 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
173 _mm_subs_epu8(p0, p3)),
174 _mm_or_si128(_mm_subs_epu8(q3, q0),
175 _mm_subs_epu8(q0, q3)));
176 flat = _mm_max_epu8(work, flat);
177 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
178 _mm_subs_epu8(p0, p4)),
179 _mm_or_si128(_mm_subs_epu8(q4, q0),
180 _mm_subs_epu8(q0, q4)));
181 flat = _mm_max_epu8(work, flat);
182 flat = _mm_subs_epu8(flat, one);
183 flat = _mm_cmpeq_epi8(flat, zero);
184 flat = _mm_and_si128(flat, mask);
185 }
186
187 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
188 // calculate flat2
189 p4 = _mm_loadu_si128((__m128i *)(s - 8 * p));
190 p3 = _mm_loadu_si128((__m128i *)(s - 7 * p));
191 p2 = _mm_loadu_si128((__m128i *)(s - 6 * p));
192 p1 = _mm_loadu_si128((__m128i *)(s - 5 * p));
193 // p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
194 // q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
195 q1 = _mm_loadu_si128((__m128i *)(s + 4 * p));
196 q2 = _mm_loadu_si128((__m128i *)(s + 5 * p));
197 q3 = _mm_loadu_si128((__m128i *)(s + 6 * p));
198 q4 = _mm_loadu_si128((__m128i *)(s + 7 * p));
199
200 {
201 const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
202 _mm_subs_epu8(p0, p1));
203 const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
204 _mm_subs_epu8(q0, q1));
205 const __m128i one = _mm_set1_epi8(1);
206 __m128i work;
207 flat2 = _mm_max_epu8(abs_p1p0, abs_q1q0);
208 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
209 _mm_subs_epu8(p0, p2)),
210 _mm_or_si128(_mm_subs_epu8(q2, q0),
211 _mm_subs_epu8(q0, q2)));
212 flat2 = _mm_max_epu8(work, flat2);
213 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
214 _mm_subs_epu8(p0, p3)),
215 _mm_or_si128(_mm_subs_epu8(q3, q0),
216 _mm_subs_epu8(q0, q3)));
217 flat2 = _mm_max_epu8(work, flat2);
218 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
219 _mm_subs_epu8(p0, p4)),
220 _mm_or_si128(_mm_subs_epu8(q4, q0),
221 _mm_subs_epu8(q0, q4)));
222 flat2 = _mm_max_epu8(work, flat2);
223 flat2 = _mm_subs_epu8(flat2, one);
224 flat2 = _mm_cmpeq_epi8(flat2, zero);
225 flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
226 }
227 // calculate flat2
228 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
229
230 {
231 const __m128i four = _mm_set1_epi16(4);
232 unsigned char *src = s;
233 i = 0;
234 do {
235 __m128i workp_a, workp_b, workp_shft;
236 p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
237 p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
238 p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
239 p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
240 p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
241 q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
242 q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
243 q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
244 q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
245 q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
246
247 workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));
248 workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
249 workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);
250 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
251 _mm_storel_epi64((__m128i *)&flat_op2[i*8],
252 _mm_packus_epi16(workp_shft, workp_shft));
253
254 workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
255 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
256 _mm_storel_epi64((__m128i *)&flat_op1[i*8],
257 _mm_packus_epi16(workp_shft, workp_shft));
258
259 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);
260 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
261 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
262 _mm_storel_epi64((__m128i *)&flat_op0[i*8],
263 _mm_packus_epi16(workp_shft, workp_shft));
264
265 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
266 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
267 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
268 _mm_storel_epi64((__m128i *)&flat_oq0[i*8],
269 _mm_packus_epi16(workp_shft, workp_shft));
270
271 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);
272 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
273 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
274 _mm_storel_epi64((__m128i *)&flat_oq1[i*8],
275 _mm_packus_epi16(workp_shft, workp_shft));
276
277 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);
278 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
279 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
280 _mm_storel_epi64((__m128i *)&flat_oq2[i*8],
281 _mm_packus_epi16(workp_shft, workp_shft));
282
283 src += 8;
284 } while (++i < 2);
285 }
286 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
287 // wide flat
288 // TODO(slavarnway): interleave with the flat pixel calculations (see above)
289 {
290 const __m128i eight = _mm_set1_epi16(8);
291 unsigned char *src = s;
292 int i = 0;
293 do {
294 __m128i workp_a, workp_b, workp_shft;
295 p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 8 * p)), zero);
296 p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 7 * p)), zero);
297 p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 6 * p)), zero);
298 p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
299 p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
300 p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
301 p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
302 p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
303 q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
304 q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
305 q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
306 q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
307 q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
308 q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 5 * p)), zero);
309 q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 6 * p)), zero);
310 q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 7 * p)), zero);
311
312
313 workp_a = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7
314 workp_a = _mm_add_epi16(_mm_slli_epi16(p6, 1), workp_a);
315 workp_b = _mm_add_epi16(_mm_add_epi16(p5, p4), _mm_add_epi16(p3, p2));
316 workp_a = _mm_add_epi16(_mm_add_epi16(p1, p0), workp_a);
317 workp_b = _mm_add_epi16(_mm_add_epi16(q0, eight), workp_b);
318 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
319 _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
320 _mm_packus_epi16(workp_shft, workp_shft));
321
322 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p5);
323 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p6), q1);
324 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
325 _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
326 _mm_packus_epi16(workp_shft, workp_shft));
327
328 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p4);
329 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p5), q2);
330 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
331 _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
332 _mm_packus_epi16(workp_shft, workp_shft));
333
334 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p3);
335 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p4), q3);
336 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
337 _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
338 _mm_packus_epi16(workp_shft, workp_shft));
339
340 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p2);
341 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p3), q4);
342 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
343 _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
344 _mm_packus_epi16(workp_shft, workp_shft));
345
346 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p1);
347 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p2), q5);
348 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
349 _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
350 _mm_packus_epi16(workp_shft, workp_shft));
351
352 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p0);
353 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), q6);
354 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
355 _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
356 _mm_packus_epi16(workp_shft, workp_shft));
357
358 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), q0);
359 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q7);
360 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
361 _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
362 _mm_packus_epi16(workp_shft, workp_shft));
363
364 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p6), q1);
365 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q7);
366 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
367 _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
368 _mm_packus_epi16(workp_shft, workp_shft));
369
370 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p5), q2);
371 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q7);
372 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
373 _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
374 _mm_packus_epi16(workp_shft, workp_shft));
375
376 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q3);
377 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q2), q7);
378 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
379 _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
380 _mm_packus_epi16(workp_shft, workp_shft));
381
382 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q4);
383 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q3), q7);
384 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
385 _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
386 _mm_packus_epi16(workp_shft, workp_shft));
387
388 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q5);
389 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q4), q7);
390 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
391 _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
392 _mm_packus_epi16(workp_shft, workp_shft));
393
394 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q6);
395 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q5), q7);
396 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
397 _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
398 _mm_packus_epi16(workp_shft, workp_shft));
399
400 src += 8;
401 } while (++i < 2);
402 }
403 // wide flat
404 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
405
406 // lp filter
407 {
408 const __m128i t4 = _mm_set1_epi8(4);
409 const __m128i t3 = _mm_set1_epi8(3);
410 const __m128i t80 = _mm_set1_epi8(0x80);
411 const __m128i te0 = _mm_set1_epi8(0xe0);
412 const __m128i t1f = _mm_set1_epi8(0x1f);
413 const __m128i t1 = _mm_set1_epi8(0x1);
414 const __m128i t7f = _mm_set1_epi8(0x7f);
415
416 __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
417 t80);
418 __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
419 t80);
420 __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
421 t80);
422 __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
423 t80);
424 __m128i filt;
425 __m128i work_a;
426 __m128i filter1, filter2;
427
428 filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
429 work_a = _mm_subs_epi8(qs0, ps0);
430 filt = _mm_adds_epi8(filt, work_a);
431 filt = _mm_adds_epi8(filt, work_a);
432 filt = _mm_adds_epi8(filt, work_a);
433 /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
434 filt = _mm_and_si128(filt, mask);
435
436 filter1 = _mm_adds_epi8(filt, t4);
437 filter2 = _mm_adds_epi8(filt, t3);
438
439 /* Filter1 >> 3 */
440 work_a = _mm_cmpgt_epi8(zero, filter1);
441 filter1 = _mm_srli_epi16(filter1, 3);
442 work_a = _mm_and_si128(work_a, te0);
443 filter1 = _mm_and_si128(filter1, t1f);
444 filter1 = _mm_or_si128(filter1, work_a);
445
446 /* Filter2 >> 3 */
447 work_a = _mm_cmpgt_epi8(zero, filter2);
448 filter2 = _mm_srli_epi16(filter2, 3);
449 work_a = _mm_and_si128(work_a, te0);
450 filter2 = _mm_and_si128(filter2, t1f);
451 filter2 = _mm_or_si128(filter2, work_a);
452
453 /* filt >> 1 */
454 filt = _mm_adds_epi8(filter1, t1);
455 work_a = _mm_cmpgt_epi8(zero, filt);
456 filt = _mm_srli_epi16(filt, 1);
457 work_a = _mm_and_si128(work_a, t80);
458 filt = _mm_and_si128(filt, t7f);
459 filt = _mm_or_si128(filt, work_a);
460
461 filt = _mm_andnot_si128(hev, filt);
462
463 ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
464 ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
465 qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
466 qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
467
468 // write out op6 - op3
469 {
470 unsigned char *dst = (s - 7 * p);
471 for (i = 6; i > 2; i--) {
472 __m128i flat2_output;
473 work_a = _mm_loadu_si128((__m128i *)dst);
474 flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);
475 work_a = _mm_andnot_si128(flat2, work_a);
476 flat2_output = _mm_and_si128(flat2, flat2_output);
477 work_a = _mm_or_si128(work_a, flat2_output);
478 _mm_storeu_si128((__m128i *)dst, work_a);
479 dst += p;
480 }
481 }
482
483 work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
484 p2 = _mm_load_si128((__m128i *)flat_op2);
485 work_a = _mm_andnot_si128(flat, work_a);
486 p2 = _mm_and_si128(flat, p2);
487 work_a = _mm_or_si128(work_a, p2);
488 p2 = _mm_load_si128((__m128i *)flat2_op[2]);
489 work_a = _mm_andnot_si128(flat2, work_a);
490 p2 = _mm_and_si128(flat2, p2);
491 p2 = _mm_or_si128(work_a, p2);
492 _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
493
494 p1 = _mm_load_si128((__m128i *)flat_op1);
495 work_a = _mm_andnot_si128(flat, ps1);
496 p1 = _mm_and_si128(flat, p1);
497 work_a = _mm_or_si128(work_a, p1);
498 p1 = _mm_load_si128((__m128i *)flat2_op[1]);
499 work_a = _mm_andnot_si128(flat2, work_a);
500 p1 = _mm_and_si128(flat2, p1);
501 p1 = _mm_or_si128(work_a, p1);
502 _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
503
504 p0 = _mm_load_si128((__m128i *)flat_op0);
505 work_a = _mm_andnot_si128(flat, ps0);
506 p0 = _mm_and_si128(flat, p0);
507 work_a = _mm_or_si128(work_a, p0);
508 p0 = _mm_load_si128((__m128i *)flat2_op[0]);
509 work_a = _mm_andnot_si128(flat2, work_a);
510 p0 = _mm_and_si128(flat2, p0);
511 p0 = _mm_or_si128(work_a, p0);
512 _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
513
514 q0 = _mm_load_si128((__m128i *)flat_oq0);
515 work_a = _mm_andnot_si128(flat, qs0);
516 q0 = _mm_and_si128(flat, q0);
517 work_a = _mm_or_si128(work_a, q0);
518 q0 = _mm_load_si128((__m128i *)flat2_oq[0]);
519 work_a = _mm_andnot_si128(flat2, work_a);
520 q0 = _mm_and_si128(flat2, q0);
521 q0 = _mm_or_si128(work_a, q0);
522 _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
523
524 q1 = _mm_load_si128((__m128i *)flat_oq1);
525 work_a = _mm_andnot_si128(flat, qs1);
526 q1 = _mm_and_si128(flat, q1);
527 work_a = _mm_or_si128(work_a, q1);
528 q1 = _mm_load_si128((__m128i *)flat2_oq[1]);
529 work_a = _mm_andnot_si128(flat2, work_a);
530 q1 = _mm_and_si128(flat2, q1);
531 q1 = _mm_or_si128(work_a, q1);
532 _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
533
534 work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
535 q2 = _mm_load_si128((__m128i *)flat_oq2);
536 work_a = _mm_andnot_si128(flat, work_a);
537 q2 = _mm_and_si128(flat, q2);
538 work_a = _mm_or_si128(work_a, q2);
539 q2 = _mm_load_si128((__m128i *)flat2_oq[2]);
540 work_a = _mm_andnot_si128(flat2, work_a);
541 q2 = _mm_and_si128(flat2, q2);
542 q2 = _mm_or_si128(work_a, q2);
543 _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
544
545 // write out oq3 - oq7
546 {
547 unsigned char *dst = (s + 3 * p);
548 for (i = 3; i < 7; i++) {
549 __m128i flat2_output;
550 work_a = _mm_loadu_si128((__m128i *)dst);
551 flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);
552 work_a = _mm_andnot_si128(flat2, work_a);
553 flat2_output = _mm_and_si128(flat2, flat2_output);
554 work_a = _mm_or_si128(work_a, flat2_output);
555 _mm_storeu_si128((__m128i *)dst, work_a);
556 dst += p;
557 }
558 }
559 }
560 }
561
562 void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
563 int p,
564 const unsigned char *_blimit,
565 const unsigned char *_limit,
566 const unsigned char *_thresh) {
567 DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
568 DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
569 DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
570 DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
571 DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
(...skipping 457 matching lines...)
1029 src[0] = t_dst + 3 * 16;
1030 src[1] = t_dst + 3 * 16 + 8;
1031
1032 dst[0] = s - 5;
1033 dst[1] = s - 5 + p * 8;
1034
1035 /* Transpose 16x8 */
1036 transpose(src, 16, dst, p, 2);
1037 }
1038
1039 void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
1040 int p,
1041 const unsigned char *blimit,
1042 const unsigned char *limit,
1043 const unsigned char *thresh) {
1044 DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
1045 unsigned char *src[4];
1046 unsigned char *dst[4];
1047
1048 /* Transpose 16x16 */
1049 transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
1050 transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);
1051
1052 /* Loop filtering */
1053 vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
1054 thresh);
1055
1056 src[0] = t_dst;
1057 src[1] = t_dst + 8 * 16;
1058 src[2] = t_dst + 8;
1059 src[3] = t_dst + 8 * 16 + 8;
1060
1061 dst[0] = s - 8;
1062 dst[1] = s - 8 + 8;
1063 dst[2] = s - 8 + p * 8;
1064 dst[3] = s - 8 + p * 8 + 8;
1065
1066 /* Transpose 16x16 */
1067 transpose(src, 16, dst, p, 4);
1068 }
1069
1070
1071 void vp9_mbloop_filter_vertical_edge_uv_sse2(unsigned char *u,
1072 int p,
1073 const unsigned char *blimit,
1074 const unsigned char *limit,
1075 const unsigned char *thresh,
1076 unsigned char *v) {
1077 DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
1078 unsigned char *src[2];
1079 unsigned char *dst[2];
1080
(...skipping 22 matching lines...)
1103 struct loop_filter_info *lfi) {
1104 vp9_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim,
1105 lfi->lim, lfi->hev_thr);
1106
1107 /* u,v */
1108 if (u_ptr)
1109 vp9_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
1110 lfi->lim, lfi->hev_thr, v_ptr);
1111 }
1112
1113
1114 void vp9_lpf_mbh_w_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
1115 unsigned char *v_ptr, int y_stride, int uv_stride,
1116 struct loop_filter_info *lfi) {
1117 vp9_mb_lpf_horizontal_edge_w_sse2(y_ptr, y_stride,
1118 lfi->mblim, lfi->lim, lfi->hev_thr);
1119
1120 /* u,v */
1121 if (u_ptr)
1122 vp9_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
1123 lfi->lim, lfi->hev_thr, v_ptr);
1124 }
1125
1126
1127 void vp9_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
1128 unsigned char *v_ptr, int y_stride, int uv_stride,
1129 struct loop_filter_info *lfi) {
1130 vp9_mbloop_filter_horizontal_edge_sse2(
1131 y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
1132
1133 if (u_ptr)
1134 vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
1135 lfi->blim, lfi->lim, lfi->hev_thr,
1136 v_ptr + 4 * uv_stride);
1137 }
1138
1139 /* Vertical MB Filtering */
1140 void vp9_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
1141 unsigned char *v_ptr, int y_stride, int uv_stride,
1142 struct loop_filter_info *lfi) {
1143 vp9_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim,
1144 lfi->hev_thr);
1145
1146 /* u,v */
1147 if (u_ptr)
1148 vp9_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
1149 lfi->lim, lfi->hev_thr, v_ptr);
1150 }
1151
1152
1153 void vp9_lpf_mbv_w_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
1154 unsigned char *v_ptr, int y_stride, int uv_stride,
1155 struct loop_filter_info *lfi) {
1156 vp9_mb_lpf_vertical_edge_w_sse2(y_ptr, y_stride,
1157 lfi->mblim, lfi->lim, lfi->hev_thr);
1158
1159 /* u,v */
1160 if (u_ptr)
1161 vp9_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
1162 lfi->lim, lfi->hev_thr, v_ptr);
1163 }
1164
1165
1166 void vp9_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
1167 unsigned char *v_ptr, int y_stride, int uv_stride,
1168 struct loop_filter_info *lfi) {
1169 vp9_mbloop_filter_vertical_edge_sse2(
1170 y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
1171
1172 if (u_ptr)
1173 vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride,
1174 lfi->blim, lfi->lim, lfi->hev_thr,
1175 v_ptr + 4);
1176 }
1177
1178 /* Horizontal B Filtering */
1179 void vp9_loop_filter_bh_sse2(unsigned char *y_ptr,
1180 unsigned char *u_ptr, unsigned char *v_ptr,
1181 int y_stride, int uv_stride,
1182 struct loop_filter_info *lfi) {
1183 vp9_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride,
1184 lfi->blim, lfi->lim, lfi->hev_thr, 2);
1185 vp9_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride,
(...skipping 36 matching lines...)
1222 }
1223
1224 void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
1225 const unsigned char *blimit) {
1226 vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
1227 vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
1228 vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
1229 }
1230
1231 #endif
