Chromium Code Reviews

Unified Diff: source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c

Issue 592203002: libvpx: Pull from upstream (Closed)
Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 3 months ago
 /*
  * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  *
  * Use of this source code is governed by a BSD-style license
  * that can be found in the LICENSE file in the root of the source
  * tree. An additional intellectual property rights grant can be found
  * in the file PATENTS. All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
  */

 #include <emmintrin.h>  // SSE2
 #include "vp9/common/vp9_loopfilter.h"
 #include "vpx_ports/emmintrin_compat.h"

+static INLINE __m128i abs_diff(__m128i a, __m128i b) {
+  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
+
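The new abs_diff() helper is the core of this cleanup: because _mm_subs_epu8 saturates at zero, one of the two subtractions is always zero, and OR-ing the pair yields the per-byte absolute difference without widening to 16 bits. A minimal scalar model of the same identity (standalone C; the names here are illustrative, not part of the patch):

    #include <assert.h>

    /* Saturating unsigned byte subtract, the scalar twin of _mm_subs_epu8. */
    static unsigned char subs_epu8(unsigned char a, unsigned char b) {
      return (unsigned char)(a > b ? a - b : 0);
    }

    /* |a - b| for unsigned bytes: one operand of the OR is always zero. */
    static unsigned char abs_diff_scalar(unsigned char a, unsigned char b) {
      return (unsigned char)(subs_epu8(a, b) | subs_epu8(b, a));
    }

    int main(void) {
      assert(abs_diff_scalar(200, 50) == 150);
      assert(abs_diff_scalar(50, 200) == 150);
      assert(abs_diff_scalar(7, 7) == 0);
      return 0;
    }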
 static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
                                             int p,
                                             const unsigned char *_blimit,
                                             const unsigned char *_limit,
                                             const unsigned char *_thresh) {
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
   const __m128i limit = _mm_load_si128((const __m128i *)_limit);
   const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
(...skipping 14 matching lines...)
   q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
                                        (__m64 *)(s + 1 * p)));
   p1q1 = _mm_shuffle_epi32(q1p1, 78);
   q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
   q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
                                        (__m64 *)(s - 0 * p)));
   p0q0 = _mm_shuffle_epi32(q0p0, 78);

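A note on the register layout built here: each qXpX register packs qX in the high eight bytes and pX in the low eight, so a single 128-bit operation filters both sides of the edge at once. The shuffle immediate 78 (which is _MM_SHUFFLE(1, 0, 3, 2)) swaps the two 64-bit halves to produce the mirrored pXqX view. A standalone sketch of the half-swap (illustrative values, not from the patch):

    #include <emmintrin.h>
    #include <stdio.h>

    int main(void) {
      /* q1 in the high 8 bytes, p1 in the low 8 bytes, as in q1p1 above. */
      __m128i q1p1 = _mm_set_epi64x(0x1111111111111111LL,   /* q1 half */
                                    0x2222222222222222LL);  /* p1 half */
      /* 78 == _MM_SHUFFLE(1, 0, 3, 2): swap the two 64-bit halves. */
      __m128i p1q1 = _mm_shuffle_epi32(q1p1, 78);
      long long lo;
      _mm_storel_epi64((__m128i *)&lo, p1q1);
      printf("low half after swap: %016llx\n", lo);  /* the q1 pattern */
      return 0;
    }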
   {
     __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
-    abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
-                            _mm_subs_epu8(q0p0, q1p1));
+    abs_p1p0 = abs_diff(q1p1, q0p0);
     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
     fe = _mm_set1_epi8(0xfe);
     ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
-                            _mm_subs_epu8(p0q0, q0p0));
-    abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
-                            _mm_subs_epu8(p1q1, q1p1));
+    abs_p0q0 = abs_diff(q0p0, p0q0);
+    abs_p1q1 = abs_diff(q1p1, p1q1);
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
     hev = _mm_subs_epu8(flat, thresh);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

     abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
     mask = _mm_max_epu8(abs_p1p0, mask);
     // mask |= (abs(p1 - p0) > limit) * -1;
     // mask |= (abs(q1 - q0) > limit) * -1;

-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
-                                     _mm_subs_epu8(q1p1, q2p2)),
-                        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
-                                     _mm_subs_epu8(q2p2, q3p3)));
+    work = _mm_max_epu8(abs_diff(q2p2, q1p1),
+                        abs_diff(q3p3, q2p2));
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
     mask = _mm_subs_epu8(mask, limit);
     mask = _mm_cmpeq_epi8(mask, zero);
   }

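The commented conditions trace the VP9 filter-mask decision: the edge is filtered only when the step across the edge and the activity on both sides stay within blimit/limit. A scalar sketch of that decision, assuming 8-bit samples (a model of the reference C mask, not this file's API):

    #include <stdlib.h>

    /* Scalar model of the filter mask built above: returns -1 (all bits
     * set, "filter this pixel") or 0, matching the vector 0xff/0x00 lanes. */
    static int filter_mask_scalar(unsigned char blimit, unsigned char limit,
                                  unsigned char p3, unsigned char p2,
                                  unsigned char p1, unsigned char p0,
                                  unsigned char q0, unsigned char q1,
                                  unsigned char q2, unsigned char q3) {
      int over = 0;
      over |= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
      over |= abs(p1 - p0) > limit;
      over |= abs(q1 - q0) > limit;
      over |= abs(p2 - p1) > limit;
      over |= abs(p3 - p2) > limit;
      over |= abs(q2 - q1) > limit;
      over |= abs(q3 - q2) > limit;
      return over ? 0 : -1;
    }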
   // lp filter
   {
     const __m128i t4 = _mm_set1_epi8(4);
     const __m128i t3 = _mm_set1_epi8(3);
(...skipping 33 matching lines...)
     filt = _mm_adds_epi16(filter1, t1);
     filt = _mm_srai_epi16(filt, 1);
     filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                             filt);
     filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
     qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
     // loopfilter done

     {
       __m128i work;
-      flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
-                                       _mm_subs_epu8(q0p0, q2p2)),
-                          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
-                                       _mm_subs_epu8(q0p0, q3p3)));
+      flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
       flat = _mm_max_epu8(abs_p1p0, flat);
       flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
       flat = _mm_subs_epu8(flat, one);
       flat = _mm_cmpeq_epi8(flat, zero);
       flat = _mm_and_si128(flat, mask);

       q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
       q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
                                            (__m64 *)(s + 5 * p)));

       q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
       q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
                                            (__m64 *)(s + 6 * p)));
-
-      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
-                                        _mm_subs_epu8(q0p0, q4p4)),
-                           _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
-                                        _mm_subs_epu8(q0p0, q5p5)));
+      flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));

       q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
       q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
                                            (__m64 *)(s + 7 * p)));
-
-      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
-                                       _mm_subs_epu8(q0p0, q6p6)),
-                          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
-                                       _mm_subs_epu8(q0p0, q7p7)));
-
+      work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
       flat2 = _mm_max_epu8(work, flat2);
       flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
       flat2 = _mm_subs_epu8(flat2, one);
       flat2 = _mm_cmpeq_epi8(flat2, zero);
       flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
     }

     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     // flat and wide flat calculations
     {
(...skipping 187 matching lines...)
     _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

     q0p0 = _mm_andnot_si128(flat2, q0p0);
     flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
     q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
     _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
     _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
   }
 }

+static INLINE __m128i filter_add2_sub2(const __m128i *const total,
+                                       const __m128i *const a1,
+                                       const __m128i *const a2,
+                                       const __m128i *const s1,
+                                       const __m128i *const s2) {
+  __m128i x = _mm_add_epi16(*a1, *total);
+  x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
+  return x;
+}
+
+static INLINE __m128i filter8_mask(const __m128i *const flat,
+                                   const __m128i *const other_filt,
+                                   const __m128i *const f8_lo,
+                                   const __m128i *const f8_hi) {
+  const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3),
+                                      _mm_srli_epi16(*f8_hi, 3));
+  const __m128i result = _mm_and_si128(*flat, f8);
+  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+}
+
+static INLINE __m128i filter16_mask(const __m128i *const flat,
+                                    const __m128i *const other_filt,
+                                    const __m128i *const f_lo,
+                                    const __m128i *const f_hi) {
+  const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4),
+                                     _mm_srli_epi16(*f_hi, 4));
+  const __m128i result = _mm_and_si128(*flat, f);
+  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+}
+
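These three new helpers factor out what the rewritten wide filters repeat at every tap: filter_add2_sub2 advances a running FIR sum by adding the two samples entering the window and subtracting the two leaving it, and filter8_mask / filter16_mask round (>> 3 or >> 4), narrow back to bytes, and blend the filtered value over the unfiltered one under the flat mask. A scalar sketch of the running-sum scheme for the 8-tap case, with the tap updates transcribed from the filter8 block further down (illustrative helper names, not the patch's API):

    /* Scalar model of the filter8 running sum: seed once, then each output
     * costs two adds and two subtracts instead of a full re-sum. */
    static unsigned char round3(int sum) { return (unsigned char)(sum >> 3); }

    static void filter8_scalar(unsigned char p3, unsigned char p2,
                               unsigned char p1, unsigned char p0,
                               unsigned char q0, unsigned char q1,
                               unsigned char q2, unsigned char q3,
                               unsigned char out[6] /* op2..oq2 */) {
      int sum = 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4;  /* +4 rounds the >> 3 */
      out[0] = round3(sum);                              /* op2 */
      sum += q1 + p1 - (p2 + p3);  out[1] = round3(sum); /* op1 */
      sum += q2 + p0 - (p1 + p3);  out[2] = round3(sum); /* op0 */
      sum += q3 + q0 - (p0 + p3);  out[3] = round3(sum); /* oq0 */
      sum += q3 + q1 - (q0 + p2);  out[4] = round3(sum); /* oq1 */
      sum += q3 + q2 - (q1 + p1);  out[5] = round3(sum); /* oq2 */
    }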
 static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
                                              int p,
                                              const unsigned char *_blimit,
                                              const unsigned char *_limit,
                                              const unsigned char *_thresh) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_op, 7 * 16);
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_oq, 7 * 16);
-
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op, 3 * 16);
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq, 3 * 16);
-
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, ap, 8 * 16);
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, aq, 8 * 16);
-
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
   const __m128i limit = _mm_load_si128((const __m128i *)_limit);
   const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
   __m128i mask, hev, flat, flat2;
   __m128i p7, p6, p5;
   __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
   __m128i q5, q6, q7;
-  int i = 0;

+  __m128i op2, op1, op0, oq0, oq1, oq2;
+
+  __m128i max_abs_p1p0q1q0;
+
+  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
+  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
+  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
   p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
   p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
   q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
   q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
-
-  _mm_store_si128((__m128i *)&ap[4 * 16], p4);
-  _mm_store_si128((__m128i *)&ap[3 * 16], p3);
-  _mm_store_si128((__m128i *)&ap[2 * 16], p2);
-  _mm_store_si128((__m128i *)&ap[1 * 16], p1);
-  _mm_store_si128((__m128i *)&ap[0 * 16], p0);
-  _mm_store_si128((__m128i *)&aq[4 * 16], q4);
-  _mm_store_si128((__m128i *)&aq[3 * 16], q3);
-  _mm_store_si128((__m128i *)&aq[2 * 16], q2);
-  _mm_store_si128((__m128i *)&aq[1 * 16], q1);
-  _mm_store_si128((__m128i *)&aq[0 * 16], q0);
-
+  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
+  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
+  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));

   {
-    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
-                                          _mm_subs_epu8(p0, p1));
-    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
-                                          _mm_subs_epu8(q0, q1));
+    const __m128i abs_p1p0 = abs_diff(p1, p0);
+    const __m128i abs_q1q0 = abs_diff(q1, q0);
     const __m128i fe = _mm_set1_epi8(0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
-    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
-                                    _mm_subs_epu8(q0, p0));
-    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
-                                    _mm_subs_epu8(q1, p1));
+    __m128i abs_p0q0 = abs_diff(p0, q0);
+    __m128i abs_p1q1 = abs_diff(p1, q1);
     __m128i work;
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);

     abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
-    mask = _mm_max_epu8(flat, mask);
+    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
     // mask |= (abs(p1 - p0) > limit) * -1;
     // mask |= (abs(q1 - q0) > limit) * -1;
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
-                                     _mm_subs_epu8(p1, p2)),
-                        _mm_or_si128(_mm_subs_epu8(p3, p2),
-                                     _mm_subs_epu8(p2, p3)));
+    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
     mask = _mm_max_epu8(work, mask);
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
-                                     _mm_subs_epu8(q1, q2)),
-                        _mm_or_si128(_mm_subs_epu8(q3, q2),
-                                     _mm_subs_epu8(q2, q3)));
+    work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
     mask = _mm_max_epu8(work, mask);
     mask = _mm_subs_epu8(mask, limit);
     mask = _mm_cmpeq_epi8(mask, zero);
   }

-  // lp filter
+  {
+    __m128i work;
+    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+    work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
+    flat = _mm_max_epu8(work, flat);
+    work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+    flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
+    flat2 = _mm_max_epu8(work, flat2);
+    work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
+    flat2 = _mm_max_epu8(work, flat2);
+    work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
+    flat2 = _mm_max_epu8(work, flat2);
+    flat2 = _mm_subs_epu8(flat2, one);
+    flat2 = _mm_cmpeq_epi8(flat2, zero);
+    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+  }
+
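Hoisting the flat/flat2 decisions out of the filter bodies is the other structural change in this function: a segment counts as "flat" when every sample within reach differs from the edge sample by at most one, in which case the strong 8- or 16-tap output is used. A scalar sketch of the test, mirroring the max/subs/cmpeq chain above (illustrative names):

    #include <stdlib.h>

    /* Scalar model of the flatness test: flat checks taps 1..3 against the
     * edge pixels p0/q0, flat2 extends the reach to taps 4..7. */
    static int flat_mask_scalar(const unsigned char *p, const unsigned char *q,
                                int first_tap, int last_tap) {
      int k;
      for (k = first_tap; k <= last_tap; ++k) {
        if (abs(p[k] - p[0]) > 1 || abs(q[k] - q[0]) > 1) return 0;
      }
      return -1;  /* all bits set: use the strong filter */
    }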
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // filter4
   {
     const __m128i t4 = _mm_set1_epi8(4);
     const __m128i t3 = _mm_set1_epi8(3);
     const __m128i t80 = _mm_set1_epi8(0x80);
     const __m128i te0 = _mm_set1_epi8(0xe0);
     const __m128i t1f = _mm_set1_epi8(0x1f);
     const __m128i t1 = _mm_set1_epi8(0x1);
     const __m128i t7f = _mm_set1_epi8(0x7f);
+    const __m128i ff = _mm_cmpeq_epi8(t4, t4);

-    __m128i ps1 = _mm_xor_si128(p1, t80);
-    __m128i ps0 = _mm_xor_si128(p0, t80);
-    __m128i qs0 = _mm_xor_si128(q0, t80);
-    __m128i qs1 = _mm_xor_si128(q1, t80);
     __m128i filt;
     __m128i work_a;
     __m128i filter1, filter2;

-    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
-    work_a = _mm_subs_epi8(qs0, ps0);
+    op1 = _mm_xor_si128(p1, t80);
+    op0 = _mm_xor_si128(p0, t80);
+    oq0 = _mm_xor_si128(q0, t80);
+    oq1 = _mm_xor_si128(q1, t80);
+
+    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+    work_a = _mm_subs_epi8(oq0, op0);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     filt = _mm_adds_epi8(filt, work_a);
     // (vp9_filter + 3 * (qs0 - ps0)) & mask
     filt = _mm_and_si128(filt, mask);
-
     filter1 = _mm_adds_epi8(filt, t4);
     filter2 = _mm_adds_epi8(filt, t3);

     // Filter1 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter1);
     filter1 = _mm_srli_epi16(filter1, 3);
     work_a = _mm_and_si128(work_a, te0);
     filter1 = _mm_and_si128(filter1, t1f);
     filter1 = _mm_or_si128(filter1, work_a);
-    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);

     // Filter2 >> 3
     work_a = _mm_cmpgt_epi8(zero, filter2);
     filter2 = _mm_srli_epi16(filter2, 3);
     work_a = _mm_and_si128(work_a, te0);
     filter2 = _mm_and_si128(filter2, t1f);
     filter2 = _mm_or_si128(filter2, work_a);
-    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);

     // filt >> 1
     filt = _mm_adds_epi8(filter1, t1);
     work_a = _mm_cmpgt_epi8(zero, filt);
     filt = _mm_srli_epi16(filt, 1);
     work_a = _mm_and_si128(work_a, t80);
     filt = _mm_and_si128(filt, t7f);
     filt = _mm_or_si128(filt, work_a);
     filt = _mm_andnot_si128(hev, filt);
-    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
-    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
     // loopfilter done

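What the 0x80-biased saturating arithmetic above computes is the standard VP9 4-tap filter; the bias lets unsigned bytes behave as signed values, and the saturating adds stand in for the explicit clamp of the C reference. A scalar sketch, assuming the reference behavior (mask and hev are 0 or -1; names illustrative, not this file's API):

    typedef signed char int8;

    /* Clamp to int8 range, the scalar analogue of _mm_adds_epi8/_mm_subs_epi8. */
    static int8 clamp8(int t) {
      return (int8)(t < -128 ? -128 : (t > 127 ? 127 : t));
    }

    static void filter4_scalar(int mask, int hev,
                               unsigned char *op1, unsigned char *op0,
                               unsigned char *oq0, unsigned char *oq1) {
      const int8 ps1 = (int8)(*op1 ^ 0x80), ps0 = (int8)(*op0 ^ 0x80);
      const int8 qs0 = (int8)(*oq0 ^ 0x80), qs1 = (int8)(*oq1 ^ 0x80);
      int8 filter, filter1, filter2;

      filter = (int8)(clamp8(ps1 - qs1) & hev);
      filter = (int8)(clamp8(filter + 3 * (qs0 - ps0)) & mask);
      filter1 = (int8)(clamp8(filter + 4) >> 3);
      filter2 = (int8)(clamp8(filter + 3) >> 3);
      *oq0 = (unsigned char)(clamp8(qs0 - filter1) ^ 0x80);
      *op0 = (unsigned char)(clamp8(ps0 + filter2) ^ 0x80);
      filter = (int8)((filter1 + 1) >> 1);
      filter = (int8)(filter & ~hev);   /* only low-variance pixels get this */
      *oq1 = (unsigned char)(clamp8(qs1 - filter) ^ 0x80);
      *op1 = (unsigned char)(clamp8(ps1 + filter) ^ 0x80);
    }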
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // filter8
     {
-      __m128i work;
-      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
-                                       _mm_subs_epu8(p0, p2)),
-                          _mm_or_si128(_mm_subs_epu8(q2, q0),
-                                       _mm_subs_epu8(q0, q2)));
-      flat = _mm_max_epu8(work, flat);
-      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
-                                       _mm_subs_epu8(p0, p3)),
-                          _mm_or_si128(_mm_subs_epu8(q3, q0),
-                                       _mm_subs_epu8(q0, q3)));
-      flat = _mm_max_epu8(work, flat);
-      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
-                                       _mm_subs_epu8(p0, p4)),
-                          _mm_or_si128(_mm_subs_epu8(q4, q0),
-                                       _mm_subs_epu8(q0, q4)));
-      flat = _mm_subs_epu8(flat, one);
-      flat = _mm_cmpeq_epi8(flat, zero);
-      flat = _mm_and_si128(flat, mask);
-
-      p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
-      q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
-      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
-                                        _mm_subs_epu8(p0, p5)),
-                           _mm_or_si128(_mm_subs_epu8(q5, q0),
-                                        _mm_subs_epu8(q0, q5)));
-      _mm_store_si128((__m128i *)&ap[5 * 16], p5);
-      _mm_store_si128((__m128i *)&aq[5 * 16], q5);
-      flat2 = _mm_max_epu8(work, flat2);
-      p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
-      q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
-      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
-                                       _mm_subs_epu8(p0, p6)),
-                          _mm_or_si128(_mm_subs_epu8(q6, q0),
-                                       _mm_subs_epu8(q0, q6)));
-      _mm_store_si128((__m128i *)&ap[6 * 16], p6);
-      _mm_store_si128((__m128i *)&aq[6 * 16], q6);
-      flat2 = _mm_max_epu8(work, flat2);
-
-      p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
-      q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
-      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
-                                       _mm_subs_epu8(p0, p7)),
-                          _mm_or_si128(_mm_subs_epu8(q7, q0),
-                                       _mm_subs_epu8(q0, q7)));
-      _mm_store_si128((__m128i *)&ap[7 * 16], p7);
-      _mm_store_si128((__m128i *)&aq[7 * 16], q7);
-      flat2 = _mm_max_epu8(work, flat2);
-      flat2 = _mm_subs_epu8(flat2, one);
-      flat2 = _mm_cmpeq_epi8(flat2, zero);
-      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+      const __m128i four = _mm_set1_epi16(4);
+      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+
+      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+      __m128i f8_lo, f8_hi;
+
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
+                            _mm_add_epi16(p3_lo, p2_lo));
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
+                            _mm_add_epi16(p2_lo, p1_lo));
+      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
+
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
+                            _mm_add_epi16(p3_hi, p2_hi));
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
+                            _mm_add_epi16(p2_hi, p1_hi));
+      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+      op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
+      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
+      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
+      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
+      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+
+      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
+      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
+      oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
     }

     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // flat and wide flat calculations
+    // wide flat calculations
     {
       const __m128i eight = _mm_set1_epi16(8);
-      const __m128i four = _mm_set1_epi16(4);
-      __m128i temp_flat2 = flat2;
-      unsigned char *src = s;
-      int i = 0;
-      do {
-        __m128i workp_shft;
-        __m128i a, b, c;
-
-        unsigned int off = i * 8;
-        p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[7 * 16] + off)),
-                               zero);
-        p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[6 * 16] + off)),
-                               zero);
-        p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[5 * 16] + off)),
-                               zero);
-        p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[4 * 16] + off)),
-                               zero);
-        p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[3 * 16] + off)),
-                               zero);
-        p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[2 * 16] + off)),
-                               zero);
-        p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[1 * 16] + off)),
-                               zero);
-        p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[0 * 16] + off)),
-                               zero);
-        q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[0 * 16] + off)),
-                               zero);
-        q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[1 * 16] + off)),
-                               zero);
-        q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[2 * 16] + off)),
-                               zero);
-        q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[3 * 16] + off)),
-                               zero);
-        q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[4 * 16] + off)),
-                               zero);
-        q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[5 * 16] + off)),
-                               zero);
-        q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[6 * 16] + off)),
-                               zero);
-        q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[7 * 16] + off)),
-                               zero);
-
-        c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
-        c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
-
-        b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
-        a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
-        a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);
-
-        _mm_storel_epi64((__m128i *)&flat_op[2 * 16 + i * 8],
-                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
-                         , b));
-
-        c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[6 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q1, a);
-        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
-        _mm_storel_epi64((__m128i *)&flat_op[1 * 16 + i * 8],
-                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
-                         , b));
-
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[5 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q2, a);
-        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
-        _mm_storel_epi64((__m128i *)&flat_op[i * 8],
-                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
-                         , b));
-
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[4 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q3, a);
-        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
-        _mm_storel_epi64((__m128i *)&flat_oq[i * 8],
-                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
-                         , b));
-
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[3 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        b = _mm_add_epi16(q3, b);
-        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
-        _mm_storel_epi64((__m128i *)&flat_oq[16 + i * 8],
-                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
-                         , b));
-
-        c = _mm_add_epi16(q4, c);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[2 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        b = _mm_add_epi16(q3, b);
-        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
-        _mm_storel_epi64((__m128i *)&flat_oq[2 * 16 + i * 8],
-                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
-                         , b));
-        a = _mm_add_epi16(q5, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q6, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[2 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[3 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[4 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[5 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        a = _mm_add_epi16(q7, a);
-        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
-        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[6 * 16 + i * 8],
-                         _mm_packus_epi16(workp_shft, workp_shft));
-
-        temp_flat2 = _mm_srli_si128(temp_flat2, 8);
-        src += 8;
-      } while (++i < 2);
+      const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
+      const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
+      const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
+      const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
+      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+      const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
+      const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
+      const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
+      const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
+
+      const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
+      const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
+      const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
+      const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
+      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+      const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
+      const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
+      const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
+      const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
+
+      __m128i f_lo;
+      __m128i f_hi;
+
+      f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo);  // p7 * 7
+      f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1),
+                           _mm_add_epi16(p4_lo, f_lo));
+      f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
+                           _mm_add_epi16(p2_lo, p1_lo));
+      f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
+      f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
+
+      f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi);  // p7 * 7
+      f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1),
+                           _mm_add_epi16(p4_hi, f_hi));
+      f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
+                           _mm_add_epi16(p2_hi, p1_hi));
+      f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
+      f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
+
+      p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
+
+      f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
+      p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+
+      f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
+      p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+
+      f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
+      p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+
+      f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
+      op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+
+      f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
+      op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+
+      f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
+      op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
+      oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
+      oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
+      oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
+      q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
+      q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
+      q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+
+      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
+      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
+      q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
+      _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
     }
     // wide flat
     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-    work_a = _mm_load_si128((__m128i *)&ap[2 * 16]);
-    p2 = _mm_load_si128((__m128i *)&flat_op[2 * 16]);
-    work_a = _mm_andnot_si128(flat, work_a);
-    p2 = _mm_and_si128(flat, p2);
-    p2 = _mm_or_si128(work_a, p2);
-    _mm_store_si128((__m128i *)&flat_op[2 * 16], p2);
-
-    p1 = _mm_load_si128((__m128i *)&flat_op[1 * 16]);
-    work_a = _mm_andnot_si128(flat, ps1);
-    p1 = _mm_and_si128(flat, p1);
-    p1 = _mm_or_si128(work_a, p1);
-    _mm_store_si128((__m128i *)&flat_op[1 * 16], p1);
-
-    p0 = _mm_load_si128((__m128i *)&flat_op[0]);
-    work_a = _mm_andnot_si128(flat, ps0);
-    p0 = _mm_and_si128(flat, p0);
-    p0 = _mm_or_si128(work_a, p0);
-    _mm_store_si128((__m128i *)&flat_op[0], p0);
-
-    q0 = _mm_load_si128((__m128i *)&flat_oq[0]);
-    work_a = _mm_andnot_si128(flat, qs0);
-    q0 = _mm_and_si128(flat, q0);
-    q0 = _mm_or_si128(work_a, q0);
-    _mm_store_si128((__m128i *)&flat_oq[0], q0);
-
-    q1 = _mm_load_si128((__m128i *)&flat_oq[1 * 16]);
-    work_a = _mm_andnot_si128(flat, qs1);
-    q1 = _mm_and_si128(flat, q1);
-    q1 = _mm_or_si128(work_a, q1);
-    _mm_store_si128((__m128i *)&flat_oq[1 * 16], q1);
-
-    work_a = _mm_load_si128((__m128i *)&aq[2 * 16]);
-    q2 = _mm_load_si128((__m128i *)&flat_oq[2 * 16]);
-    work_a = _mm_andnot_si128(flat, work_a);
-    q2 = _mm_and_si128(flat, q2);
-    q2 = _mm_or_si128(work_a, q2);
-    _mm_store_si128((__m128i *)&flat_oq[2 * 16], q2);
-
-    // write out op6 - op3
-    {
-      unsigned char *dst = (s - 7 * p);
-      for (i = 6; i > 2; i--) {
-        __m128i flat2_output;
-        work_a = _mm_load_si128((__m128i *)&ap[i * 16]);
-        flat2_output = _mm_load_si128((__m128i *)&flat2_op[i * 16]);
-        work_a = _mm_andnot_si128(flat2, work_a);
-        flat2_output = _mm_and_si128(flat2, flat2_output);
-        work_a = _mm_or_si128(work_a, flat2_output);
-        _mm_storeu_si128((__m128i *)dst, work_a);
-        dst += p;
-      }
-    }
-
-    work_a = _mm_load_si128((__m128i *)&flat_op[2 * 16]);
-    p2 = _mm_load_si128((__m128i *)&flat2_op[2 * 16]);
-    work_a = _mm_andnot_si128(flat2, work_a);
-    p2 = _mm_and_si128(flat2, p2);
-    p2 = _mm_or_si128(work_a, p2);
-    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
-
-    work_a = _mm_load_si128((__m128i *)&flat_op[1 * 16]);
-    p1 = _mm_load_si128((__m128i *)&flat2_op[1 * 16]);
-    work_a = _mm_andnot_si128(flat2, work_a);
-    p1 = _mm_and_si128(flat2, p1);
-    p1 = _mm_or_si128(work_a, p1);
-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-
-    work_a = _mm_load_si128((__m128i *)&flat_op[0]);
-    p0 = _mm_load_si128((__m128i *)&flat2_op[0]);
-    work_a = _mm_andnot_si128(flat2, work_a);
-    p0 = _mm_and_si128(flat2, p0);
-    p0 = _mm_or_si128(work_a, p0);
-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-
-    work_a = _mm_load_si128((__m128i *)&flat_oq[0]);
-    q0 = _mm_load_si128((__m128i *)&flat2_oq[0]);
-    work_a = _mm_andnot_si128(flat2, work_a);
-    q0 = _mm_and_si128(flat2, q0);
-    q0 = _mm_or_si128(work_a, q0);
-    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
-
-    work_a = _mm_load_si128((__m128i *)&flat_oq[1 * 16]);
-    q1 = _mm_load_si128((__m128i *)&flat2_oq[16]);
-    work_a = _mm_andnot_si128(flat2, work_a);
-    q1 = _mm_and_si128(flat2, q1);
-    q1 = _mm_or_si128(work_a, q1);
-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-
-    work_a = _mm_load_si128((__m128i *)&flat_oq[2 * 16]);
-    q2 = _mm_load_si128((__m128i *)&flat2_oq[2 * 16]);
-    work_a = _mm_andnot_si128(flat2, work_a);
-    q2 = _mm_and_si128(flat2, q2);
-    q2 = _mm_or_si128(work_a, q2);
-    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
-
-    // write out oq3 - oq7
-    {
-      unsigned char *dst = (s + 3 * p);
-      for (i = 3; i < 7; i++) {
-        __m128i flat2_output;
-        work_a = _mm_load_si128((__m128i *)&aq[i * 16]);
-        flat2_output = _mm_load_si128((__m128i *)&flat2_oq[i * 16]);
-        work_a = _mm_andnot_si128(flat2, work_a);
-        flat2_output = _mm_and_si128(flat2, flat2_output);
-        work_a = _mm_or_si128(work_a, flat2_output);
-        _mm_storeu_si128((__m128i *)dst, work_a);
-        dst += p;
-      }
-    }
   }
 }

 // TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
 void vp9_lpf_horizontal_16_sse2(unsigned char *s, int p,
                                 const unsigned char *_blimit,
                                 const unsigned char *_limit,
                                 const unsigned char *_thresh, int count) {
   if (count == 1)
     mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
(...skipping 31 matching lines...)
                                      _mm_loadl_epi64((__m128i *)(s - 0 * p)));
   p1q1 = _mm_shuffle_epi32(q1p1, 78);
   p0q0 = _mm_shuffle_epi32(q0p0, 78);

   {
     // filter_mask and hev_mask
     const __m128i one = _mm_set1_epi8(1);
     const __m128i fe = _mm_set1_epi8(0xfe);
     const __m128i ff = _mm_cmpeq_epi8(fe, fe);
     __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-    abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
-                            _mm_subs_epu8(q0p0, q1p1));
+    abs_p1p0 = abs_diff(q1p1, q0p0);
     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);

-    abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
-                            _mm_subs_epu8(p0q0, q0p0));
-    abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
-                            _mm_subs_epu8(p1q1, q1p1));
+    abs_p0q0 = abs_diff(q0p0, p0q0);
+    abs_p1q1 = abs_diff(q1p1, p1q1);
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
     hev = _mm_subs_epu8(flat, thresh);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

     abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
     mask = _mm_max_epu8(abs_p1p0, mask);
     // mask |= (abs(p1 - p0) > limit) * -1;
     // mask |= (abs(q1 - q0) > limit) * -1;

-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
-                                     _mm_subs_epu8(q1p1, q2p2)),
-                        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
-                                     _mm_subs_epu8(q2p2, q3p3)));
+    work = _mm_max_epu8(abs_diff(q2p2, q1p1),
+                        abs_diff(q3p3, q2p2));
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
     mask = _mm_subs_epu8(mask, limit);
     mask = _mm_cmpeq_epi8(mask, zero);

     // flat_mask4

-    flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
-                                     _mm_subs_epu8(q0p0, q2p2)),
-                        _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
-                                     _mm_subs_epu8(q0p0, q3p3)));
+    flat = _mm_max_epu8(abs_diff(q2p2, q0p0),
+                        abs_diff(q3p3, q0p0));
     flat = _mm_max_epu8(abs_p1p0, flat);
     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
     flat = _mm_subs_epu8(flat, one);
     flat = _mm_cmpeq_epi8(flat, zero);
     flat = _mm_and_si128(flat, mask);
   }

   {
     const __m128i four = _mm_set1_epi16(4);
     unsigned char *src = s;
(...skipping 765 matching lines...)
   transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);

   // Loop filtering
   mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
                                    thresh);

   // Transpose back
   transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
   transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
 }
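This last context hunk shows how the file handles vertical edges: the 16x16 neighborhood is transposed into a scratch buffer, filtered with the horizontal kernel above, then transposed back. A scalar sketch of that round trip (a hypothetical helper, assuming an 8-bit w x h block; not the file's transpose8x16):

    /* Rows of dst are columns of src, so a column-wise (vertical) filter
     * becomes a row-wise (horizontal) one on the transposed copy. */
    static void transpose_block(const unsigned char *src, int src_stride,
                                unsigned char *dst, int dst_stride,
                                int w, int h) {
      int r, c;
      for (r = 0; r < h; ++r)
        for (c = 0; c < w; ++c)
          dst[c * dst_stride + r] = src[r * src_stride + c];
    }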