| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <emmintrin.h> // SSE2 | 11 #include <emmintrin.h> // SSE2 |
| 12 #include "vp9/common/vp9_loopfilter.h" | 12 #include "vp9/common/vp9_loopfilter.h" |
| 13 #include "vpx_ports/emmintrin_compat.h" | 13 #include "vpx_ports/emmintrin_compat.h" |
| 14 | 14 |
| 15 static INLINE __m128i abs_diff(__m128i a, __m128i b) { |
| 16 return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); |
| 17 } |
| 18 |
| 15 static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, | 19 static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, |
| 16 int p, | 20 int p, |
| 17 const unsigned char *_blimit, | 21 const unsigned char *_blimit, |
| 18 const unsigned char *_limit, | 22 const unsigned char *_limit, |
| 19 const unsigned char *_thresh) { | 23 const unsigned char *_thresh) { |
| 20 const __m128i zero = _mm_set1_epi16(0); | 24 const __m128i zero = _mm_set1_epi16(0); |
| 21 const __m128i one = _mm_set1_epi8(1); | 25 const __m128i one = _mm_set1_epi8(1); |
| 22 const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); | 26 const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); |
| 23 const __m128i limit = _mm_load_si128((const __m128i *)_limit); | 27 const __m128i limit = _mm_load_si128((const __m128i *)_limit); |
| 24 const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); | 28 const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); |
| (...skipping 14 matching lines...) Expand all Loading... |
| 39 q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1), | 43 q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1), |
| 40 (__m64 *)(s + 1 * p))); | 44 (__m64 *)(s + 1 * p))); |
| 41 p1q1 = _mm_shuffle_epi32(q1p1, 78); | 45 p1q1 = _mm_shuffle_epi32(q1p1, 78); |
| 42 q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); | 46 q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); |
| 43 q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0), | 47 q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0), |
| 44 (__m64 *)(s - 0 * p))); | 48 (__m64 *)(s - 0 * p))); |
| 45 p0q0 = _mm_shuffle_epi32(q0p0, 78); | 49 p0q0 = _mm_shuffle_epi32(q0p0, 78); |
| 46 | 50 |
| 47 { | 51 { |
| 48 __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; | 52 __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; |
| 49 abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), | 53 abs_p1p0 = abs_diff(q1p1, q0p0); |
| 50 _mm_subs_epu8(q0p0, q1p1)); | |
| 51 abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); | 54 abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); |
| 52 fe = _mm_set1_epi8(0xfe); | 55 fe = _mm_set1_epi8(0xfe); |
| 53 ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); | 56 ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); |
| 54 abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), | 57 abs_p0q0 = abs_diff(q0p0, p0q0); |
| 55 _mm_subs_epu8(p0q0, q0p0)); | 58 abs_p1q1 = abs_diff(q1p1, p1q1); |
| 56 abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), | |
| 57 _mm_subs_epu8(p1q1, q1p1)); | |
| 58 flat = _mm_max_epu8(abs_p1p0, abs_q1q0); | 59 flat = _mm_max_epu8(abs_p1p0, abs_q1q0); |
| 59 hev = _mm_subs_epu8(flat, thresh); | 60 hev = _mm_subs_epu8(flat, thresh); |
| 60 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); | 61 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); |
| 61 | 62 |
| 62 abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); | 63 abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); |
| 63 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); | 64 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); |
| 64 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); | 65 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); |
| 65 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); | 66 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); |
| 66 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; | 67 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; |
| 67 mask = _mm_max_epu8(abs_p1p0, mask); | 68 mask = _mm_max_epu8(abs_p1p0, mask); |
| 68 // mask |= (abs(p1 - p0) > limit) * -1; | 69 // mask |= (abs(p1 - p0) > limit) * -1; |
| 69 // mask |= (abs(q1 - q0) > limit) * -1; | 70 // mask |= (abs(q1 - q0) > limit) * -1; |
| 70 | 71 |
| 71 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1), | 72 work = _mm_max_epu8(abs_diff(q2p2, q1p1), |
| 72 _mm_subs_epu8(q1p1, q2p2)), | 73 abs_diff(q3p3, q2p2)); |
| 73 _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), | |
| 74 _mm_subs_epu8(q2p2, q3p3))); | |
| 75 mask = _mm_max_epu8(work, mask); | 74 mask = _mm_max_epu8(work, mask); |
| 76 mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); | 75 mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); |
| 77 mask = _mm_subs_epu8(mask, limit); | 76 mask = _mm_subs_epu8(mask, limit); |
| 78 mask = _mm_cmpeq_epi8(mask, zero); | 77 mask = _mm_cmpeq_epi8(mask, zero); |
| 79 } | 78 } |
| 80 | 79 |
| 81 // lp filter | 80 // lp filter |
| 82 { | 81 { |
| 83 const __m128i t4 = _mm_set1_epi8(4); | 82 const __m128i t4 = _mm_set1_epi8(4); |
| 84 const __m128i t3 = _mm_set1_epi8(3); | 83 const __m128i t3 = _mm_set1_epi8(3); |
| (...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 118 filt = _mm_adds_epi16(filter1, t1); | 117 filt = _mm_adds_epi16(filter1, t1); |
| 119 filt = _mm_srai_epi16(filt, 1); | 118 filt = _mm_srai_epi16(filt, 1); |
| 120 filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), | 119 filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), |
| 121 filt); | 120 filt); |
| 122 filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); | 121 filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); |
| 123 qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); | 122 qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); |
| 124 // loopfilter done | 123 // loopfilter done |
| 125 | 124 |
| 126 { | 125 { |
| 127 __m128i work; | 126 __m128i work; |
| 128 flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0), | 127 flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); |
| 129 _mm_subs_epu8(q0p0, q2p2)), | |
| 130 _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), | |
| 131 _mm_subs_epu8(q0p0, q3p3))); | |
| 132 flat = _mm_max_epu8(abs_p1p0, flat); | 128 flat = _mm_max_epu8(abs_p1p0, flat); |
| 133 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); | 129 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); |
| 134 flat = _mm_subs_epu8(flat, one); | 130 flat = _mm_subs_epu8(flat, one); |
| 135 flat = _mm_cmpeq_epi8(flat, zero); | 131 flat = _mm_cmpeq_epi8(flat, zero); |
| 136 flat = _mm_and_si128(flat, mask); | 132 flat = _mm_and_si128(flat, mask); |
| 137 | 133 |
| 138 q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); | 134 q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); |
| 139 q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5), | 135 q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5), |
| 140 (__m64 *)(s + 5 * p))); | 136 (__m64 *)(s + 5 * p))); |
| 141 | 137 |
| 142 q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); | 138 q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); |
| 143 q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6), | 139 q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6), |
| 144 (__m64 *)(s + 6 * p))); | 140 (__m64 *)(s + 6 * p))); |
| 145 | 141 flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0)); |
| 146 flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q4p4, q0p0), | |
| 147 _mm_subs_epu8(q0p0, q4p4)), | |
| 148 _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), | |
| 149 _mm_subs_epu8(q0p0, q5p5))); | |
| 150 | 142 |
| 151 q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); | 143 q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); |
| 152 q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7), | 144 q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7), |
| 153 (__m64 *)(s + 7 * p))); | 145 (__m64 *)(s + 7 * p))); |
| 154 | 146 work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0)); |
| 155 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q6p6, q0p0), | |
| 156 _mm_subs_epu8(q0p0, q6p6)), | |
| 157 _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), | |
| 158 _mm_subs_epu8(q0p0, q7p7))); | |
| 159 | |
| 160 flat2 = _mm_max_epu8(work, flat2); | 147 flat2 = _mm_max_epu8(work, flat2); |
| 161 flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); | 148 flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); |
| 162 flat2 = _mm_subs_epu8(flat2, one); | 149 flat2 = _mm_subs_epu8(flat2, one); |
| 163 flat2 = _mm_cmpeq_epi8(flat2, zero); | 150 flat2 = _mm_cmpeq_epi8(flat2, zero); |
| 164 flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask | 151 flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask |
| 165 } | 152 } |
| 166 | 153 |
| 167 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 154 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 168 // flat and wide flat calculations | 155 // flat and wide flat calculations |
| 169 { | 156 { |
| (...skipping 187 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 357 _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); | 344 _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); |
| 358 | 345 |
| 359 q0p0 = _mm_andnot_si128(flat2, q0p0); | 346 q0p0 = _mm_andnot_si128(flat2, q0p0); |
| 360 flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); | 347 flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); |
| 361 q0p0 = _mm_or_si128(q0p0, flat2_q0p0); | 348 q0p0 = _mm_or_si128(q0p0, flat2_q0p0); |
| 362 _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); | 349 _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); |
| 363 _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); | 350 _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); |
| 364 } | 351 } |
| 365 } | 352 } |
| 366 | 353 |
| 354 static INLINE __m128i filter_add2_sub2(const __m128i *const total, |
| 355 const __m128i *const a1, |
| 356 const __m128i *const a2, |
| 357 const __m128i *const s1, |
| 358 const __m128i *const s2) { |
| 359 __m128i x = _mm_add_epi16(*a1, *total); |
| 360 x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2); |
| 361 return x; |
| 362 } |
| 363 |
| 364 static INLINE __m128i filter8_mask(const __m128i *const flat, |
| 365 const __m128i *const other_filt, |
| 366 const __m128i *const f8_lo, |
| 367 const __m128i *const f8_hi) { |
| 368 const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), |
| 369 _mm_srli_epi16(*f8_hi, 3)); |
| 370 const __m128i result = _mm_and_si128(*flat, f8); |
| 371 return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); |
| 372 } |
| 373 |
| 374 static INLINE __m128i filter16_mask(const __m128i *const flat, |
| 375 const __m128i *const other_filt, |
| 376 const __m128i *const f_lo, |
| 377 const __m128i *const f_hi) { |
| 378 const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), |
| 379 _mm_srli_epi16(*f_hi, 4)); |
| 380 const __m128i result = _mm_and_si128(*flat, f); |
| 381 return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); |
| 382 } |
| 383 |
| 367 static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, | 384 static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, |
| 368 int p, | 385 int p, |
| 369 const unsigned char *_blimit, | 386 const unsigned char *_blimit, |
| 370 const unsigned char *_limit, | 387 const unsigned char *_limit, |
| 371 const unsigned char *_thresh) { | 388 const unsigned char *_thresh) { |
| 372 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_op, 7 * 16); | |
| 373 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_oq, 7 * 16); | |
| 374 | |
| 375 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op, 3 * 16); | |
| 376 DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq, 3 * 16); | |
| 377 | |
| 378 DECLARE_ALIGNED_ARRAY(16, unsigned char, ap, 8 * 16); | |
| 379 DECLARE_ALIGNED_ARRAY(16, unsigned char, aq, 8 * 16); | |
| 380 | |
| 381 const __m128i zero = _mm_set1_epi16(0); | 389 const __m128i zero = _mm_set1_epi16(0); |
| 382 const __m128i one = _mm_set1_epi8(1); | 390 const __m128i one = _mm_set1_epi8(1); |
| 383 const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); | 391 const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); |
| 384 const __m128i limit = _mm_load_si128((const __m128i *)_limit); | 392 const __m128i limit = _mm_load_si128((const __m128i *)_limit); |
| 385 const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); | 393 const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); |
| 386 __m128i mask, hev, flat, flat2; | 394 __m128i mask, hev, flat, flat2; |
| 387 __m128i p7, p6, p5; | 395 __m128i p7, p6, p5; |
| 388 __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; | 396 __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; |
| 389 __m128i q5, q6, q7; | 397 __m128i q5, q6, q7; |
| 390 int i = 0; | |
| 391 | 398 |
| 399 __m128i op2, op1, op0, oq0, oq1, oq2; |
| 400 |
| 401 __m128i max_abs_p1p0q1q0; |
| 402 |
| 403 p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); |
| 404 p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); |
| 405 p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); |
| 392 p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); | 406 p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); |
| 393 p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); | 407 p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); |
| 394 p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); | 408 p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); |
| 395 p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); | 409 p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); |
| 396 p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); | 410 p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); |
| 397 q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); | 411 q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); |
| 398 q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); | 412 q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); |
| 399 q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); | 413 q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); |
| 400 q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); | 414 q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); |
| 401 q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); | 415 q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); |
| 402 | 416 q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); |
| 403 _mm_store_si128((__m128i *)&ap[4 * 16], p4); | 417 q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); |
| 404 _mm_store_si128((__m128i *)&ap[3 * 16], p3); | 418 q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); |
| 405 _mm_store_si128((__m128i *)&ap[2 * 16], p2); | |
| 406 _mm_store_si128((__m128i *)&ap[1 * 16], p1); | |
| 407 _mm_store_si128((__m128i *)&ap[0 * 16], p0); | |
| 408 _mm_store_si128((__m128i *)&aq[4 * 16], q4); | |
| 409 _mm_store_si128((__m128i *)&aq[3 * 16], q3); | |
| 410 _mm_store_si128((__m128i *)&aq[2 * 16], q2); | |
| 411 _mm_store_si128((__m128i *)&aq[1 * 16], q1); | |
| 412 _mm_store_si128((__m128i *)&aq[0 * 16], q0); | |
| 413 | |
| 414 | 419 |
| 415 { | 420 { |
| 416 const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), | 421 const __m128i abs_p1p0 = abs_diff(p1, p0); |
| 417 _mm_subs_epu8(p0, p1)); | 422 const __m128i abs_q1q0 = abs_diff(q1, q0); |
| 418 const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), | |
| 419 _mm_subs_epu8(q0, q1)); | |
| 420 const __m128i fe = _mm_set1_epi8(0xfe); | 423 const __m128i fe = _mm_set1_epi8(0xfe); |
| 421 const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); | 424 const __m128i ff = _mm_cmpeq_epi8(zero, zero); |
| 422 __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), | 425 __m128i abs_p0q0 = abs_diff(p0, q0); |
| 423 _mm_subs_epu8(q0, p0)); | 426 __m128i abs_p1q1 = abs_diff(p1, q1); |
| 424 __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), | |
| 425 _mm_subs_epu8(q1, p1)); | |
| 426 __m128i work; | 427 __m128i work; |
| 427 flat = _mm_max_epu8(abs_p1p0, abs_q1q0); | 428 max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); |
| 428 hev = _mm_subs_epu8(flat, thresh); | |
| 429 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); | |
| 430 | 429 |
| 431 abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); | 430 abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); |
| 432 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); | 431 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); |
| 433 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); | 432 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); |
| 434 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); | 433 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); |
| 435 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; | 434 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; |
| 436 mask = _mm_max_epu8(flat, mask); | 435 mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); |
| 437 // mask |= (abs(p1 - p0) > limit) * -1; | 436 // mask |= (abs(p1 - p0) > limit) * -1; |
| 438 // mask |= (abs(q1 - q0) > limit) * -1; | 437 // mask |= (abs(q1 - q0) > limit) * -1; |
| 439 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), | 438 work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2)); |
| 440 _mm_subs_epu8(p1, p2)), | |
| 441 _mm_or_si128(_mm_subs_epu8(p3, p2), | |
| 442 _mm_subs_epu8(p2, p3))); | |
| 443 mask = _mm_max_epu8(work, mask); | 439 mask = _mm_max_epu8(work, mask); |
| 444 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), | 440 work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); |
| 445 _mm_subs_epu8(q1, q2)), | |
| 446 _mm_or_si128(_mm_subs_epu8(q3, q2), | |
| 447 _mm_subs_epu8(q2, q3))); | |
| 448 mask = _mm_max_epu8(work, mask); | 441 mask = _mm_max_epu8(work, mask); |
| 449 mask = _mm_subs_epu8(mask, limit); | 442 mask = _mm_subs_epu8(mask, limit); |
| 450 mask = _mm_cmpeq_epi8(mask, zero); | 443 mask = _mm_cmpeq_epi8(mask, zero); |
| 451 } | 444 } |
| 452 | 445 |
| 453 // lp filter | 446 { |
| 447 __m128i work; |
| 448 work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0)); |
| 449 flat = _mm_max_epu8(work, max_abs_p1p0q1q0); |
| 450 work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0)); |
| 451 flat = _mm_max_epu8(work, flat); |
| 452 work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0)); |
| 453 flat = _mm_subs_epu8(flat, one); |
| 454 flat = _mm_cmpeq_epi8(flat, zero); |
| 455 flat = _mm_and_si128(flat, mask); |
| 456 flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0)); |
| 457 flat2 = _mm_max_epu8(work, flat2); |
| 458 work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0)); |
| 459 flat2 = _mm_max_epu8(work, flat2); |
| 460 work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0)); |
| 461 flat2 = _mm_max_epu8(work, flat2); |
| 462 flat2 = _mm_subs_epu8(flat2, one); |
| 463 flat2 = _mm_cmpeq_epi8(flat2, zero); |
| 464 flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask |
| 465 } |
| 466 |
| 467 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 468 // filter4 |
| 454 { | 469 { |
| 455 const __m128i t4 = _mm_set1_epi8(4); | 470 const __m128i t4 = _mm_set1_epi8(4); |
| 456 const __m128i t3 = _mm_set1_epi8(3); | 471 const __m128i t3 = _mm_set1_epi8(3); |
| 457 const __m128i t80 = _mm_set1_epi8(0x80); | 472 const __m128i t80 = _mm_set1_epi8(0x80); |
| 458 const __m128i te0 = _mm_set1_epi8(0xe0); | 473 const __m128i te0 = _mm_set1_epi8(0xe0); |
| 459 const __m128i t1f = _mm_set1_epi8(0x1f); | 474 const __m128i t1f = _mm_set1_epi8(0x1f); |
| 460 const __m128i t1 = _mm_set1_epi8(0x1); | 475 const __m128i t1 = _mm_set1_epi8(0x1); |
| 461 const __m128i t7f = _mm_set1_epi8(0x7f); | 476 const __m128i t7f = _mm_set1_epi8(0x7f); |
| 477 const __m128i ff = _mm_cmpeq_epi8(t4, t4); |
| 462 | 478 |
| 463 __m128i ps1 = _mm_xor_si128(p1, t80); | |
| 464 __m128i ps0 = _mm_xor_si128(p0, t80); | |
| 465 __m128i qs0 = _mm_xor_si128(q0, t80); | |
| 466 __m128i qs1 = _mm_xor_si128(q1, t80); | |
| 467 __m128i filt; | 479 __m128i filt; |
| 468 __m128i work_a; | 480 __m128i work_a; |
| 469 __m128i filter1, filter2; | 481 __m128i filter1, filter2; |
| 470 | 482 |
| 471 filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); | 483 op1 = _mm_xor_si128(p1, t80); |
| 472 work_a = _mm_subs_epi8(qs0, ps0); | 484 op0 = _mm_xor_si128(p0, t80); |
| 485 oq0 = _mm_xor_si128(q0, t80); |
| 486 oq1 = _mm_xor_si128(q1, t80); |
| 487 |
| 488 hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh); |
| 489 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); |
| 490 filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); |
| 491 |
| 492 work_a = _mm_subs_epi8(oq0, op0); |
| 473 filt = _mm_adds_epi8(filt, work_a); | 493 filt = _mm_adds_epi8(filt, work_a); |
| 474 filt = _mm_adds_epi8(filt, work_a); | 494 filt = _mm_adds_epi8(filt, work_a); |
| 475 filt = _mm_adds_epi8(filt, work_a); | 495 filt = _mm_adds_epi8(filt, work_a); |
| 476 // (vp9_filter + 3 * (qs0 - ps0)) & mask | 496 // (vp9_filter + 3 * (qs0 - ps0)) & mask |
| 477 filt = _mm_and_si128(filt, mask); | 497 filt = _mm_and_si128(filt, mask); |
| 478 | |
| 479 filter1 = _mm_adds_epi8(filt, t4); | 498 filter1 = _mm_adds_epi8(filt, t4); |
| 480 filter2 = _mm_adds_epi8(filt, t3); | 499 filter2 = _mm_adds_epi8(filt, t3); |
| 481 | 500 |
| 482 // Filter1 >> 3 | 501 // Filter1 >> 3 |
| 483 work_a = _mm_cmpgt_epi8(zero, filter1); | 502 work_a = _mm_cmpgt_epi8(zero, filter1); |
| 484 filter1 = _mm_srli_epi16(filter1, 3); | 503 filter1 = _mm_srli_epi16(filter1, 3); |
| 485 work_a = _mm_and_si128(work_a, te0); | 504 work_a = _mm_and_si128(work_a, te0); |
| 486 filter1 = _mm_and_si128(filter1, t1f); | 505 filter1 = _mm_and_si128(filter1, t1f); |
| 487 filter1 = _mm_or_si128(filter1, work_a); | 506 filter1 = _mm_or_si128(filter1, work_a); |
| 488 qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); | 507 oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); |
| 489 | 508 |
| 490 // Filter2 >> 3 | 509 // Filter2 >> 3 |
| 491 work_a = _mm_cmpgt_epi8(zero, filter2); | 510 work_a = _mm_cmpgt_epi8(zero, filter2); |
| 492 filter2 = _mm_srli_epi16(filter2, 3); | 511 filter2 = _mm_srli_epi16(filter2, 3); |
| 493 work_a = _mm_and_si128(work_a, te0); | 512 work_a = _mm_and_si128(work_a, te0); |
| 494 filter2 = _mm_and_si128(filter2, t1f); | 513 filter2 = _mm_and_si128(filter2, t1f); |
| 495 filter2 = _mm_or_si128(filter2, work_a); | 514 filter2 = _mm_or_si128(filter2, work_a); |
| 496 ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); | 515 op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); |
| 497 | 516 |
| 498 // filt >> 1 | 517 // filt >> 1 |
| 499 filt = _mm_adds_epi8(filter1, t1); | 518 filt = _mm_adds_epi8(filter1, t1); |
| 500 work_a = _mm_cmpgt_epi8(zero, filt); | 519 work_a = _mm_cmpgt_epi8(zero, filt); |
| 501 filt = _mm_srli_epi16(filt, 1); | 520 filt = _mm_srli_epi16(filt, 1); |
| 502 work_a = _mm_and_si128(work_a, t80); | 521 work_a = _mm_and_si128(work_a, t80); |
| 503 filt = _mm_and_si128(filt, t7f); | 522 filt = _mm_and_si128(filt, t7f); |
| 504 filt = _mm_or_si128(filt, work_a); | 523 filt = _mm_or_si128(filt, work_a); |
| 505 filt = _mm_andnot_si128(hev, filt); | 524 filt = _mm_andnot_si128(hev, filt); |
| 506 ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); | 525 op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); |
| 507 qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); | 526 oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); |
| 508 // loopfilter done | 527 // loopfilter done |
| 509 | 528 |
| 529 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 530 // filter8 |
| 510 { | 531 { |
| 511 __m128i work; | 532 const __m128i four = _mm_set1_epi16(4); |
| 512 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), | 533 const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); |
| 513 _mm_subs_epu8(p0, p2)), | 534 const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); |
| 514 _mm_or_si128(_mm_subs_epu8(q2, q0), | 535 const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); |
| 515 _mm_subs_epu8(q0, q2))); | 536 const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); |
| 516 flat = _mm_max_epu8(work, flat); | 537 const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); |
| 517 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), | 538 const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); |
| 518 _mm_subs_epu8(p0, p3)), | 539 const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); |
| 519 _mm_or_si128(_mm_subs_epu8(q3, q0), | 540 const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); |
| 520 _mm_subs_epu8(q0, q3))); | |
| 521 flat = _mm_max_epu8(work, flat); | |
| 522 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), | |
| 523 _mm_subs_epu8(p0, p4)), | |
| 524 _mm_or_si128(_mm_subs_epu8(q4, q0), | |
| 525 _mm_subs_epu8(q0, q4))); | |
| 526 flat = _mm_subs_epu8(flat, one); | |
| 527 flat = _mm_cmpeq_epi8(flat, zero); | |
| 528 flat = _mm_and_si128(flat, mask); | |
| 529 | 541 |
| 530 p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); | 542 const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); |
| 531 q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); | 543 const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); |
| 532 flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0), | 544 const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); |
| 533 _mm_subs_epu8(p0, p5)), | 545 const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); |
| 534 _mm_or_si128(_mm_subs_epu8(q5, q0), | 546 const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); |
| 535 _mm_subs_epu8(q0, q5))); | 547 const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); |
| 536 _mm_store_si128((__m128i *)&ap[5 * 16], p5); | 548 const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); |
| 537 _mm_store_si128((__m128i *)&aq[5 * 16], q5); | 549 const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); |
| 538 flat2 = _mm_max_epu8(work, flat2); | 550 __m128i f8_lo, f8_hi; |
| 539 p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); | |
| 540 q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); | |
| 541 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0), | |
| 542 _mm_subs_epu8(p0, p6)), | |
| 543 _mm_or_si128(_mm_subs_epu8(q6, q0), | |
| 544 _mm_subs_epu8(q0, q6))); | |
| 545 _mm_store_si128((__m128i *)&ap[6 * 16], p6); | |
| 546 _mm_store_si128((__m128i *)&aq[6 * 16], q6); | |
| 547 flat2 = _mm_max_epu8(work, flat2); | |
| 548 | 551 |
| 549 p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); | 552 f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four), |
| 550 q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); | 553 _mm_add_epi16(p3_lo, p2_lo)); |
| 551 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0), | 554 f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo), |
| 552 _mm_subs_epu8(p0, p7)), | 555 _mm_add_epi16(p2_lo, p1_lo)); |
| 553 _mm_or_si128(_mm_subs_epu8(q7, q0), | 556 f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); |
| 554 _mm_subs_epu8(q0, q7))); | 557 |
| 555 _mm_store_si128((__m128i *)&ap[7 * 16], p7); | 558 f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four), |
| 556 _mm_store_si128((__m128i *)&aq[7 * 16], q7); | 559 _mm_add_epi16(p3_hi, p2_hi)); |
| 557 flat2 = _mm_max_epu8(work, flat2); | 560 f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi), |
| 558 flat2 = _mm_subs_epu8(flat2, one); | 561 _mm_add_epi16(p2_hi, p1_hi)); |
| 559 flat2 = _mm_cmpeq_epi8(flat2, zero); | 562 f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi); |
| 560 flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask | 563 |
| 564 op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi); |
| 565 |
| 566 f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo); |
| 567 f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi); |
| 568 op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi); |
| 569 |
| 570 f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo); |
| 571 f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi); |
| 572 op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi); |
| 573 |
| 574 f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo); |
| 575 f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi); |
| 576 oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi); |
| 577 |
| 578 f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo); |
| 579 f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi); |
| 580 oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi); |
| 581 |
| 582 f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo); |
| 583 f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi); |
| 584 oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi); |
| 561 } | 585 } |
| 562 | 586 |
| 563 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 587 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 564 // flat and wide flat calculations | 588 // wide flat calculations |
| 565 { | 589 { |
| 566 const __m128i eight = _mm_set1_epi16(8); | 590 const __m128i eight = _mm_set1_epi16(8); |
| 567 const __m128i four = _mm_set1_epi16(4); | 591 const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero); |
| 568 __m128i temp_flat2 = flat2; | 592 const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero); |
| 569 unsigned char *src = s; | 593 const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero); |
| 570 int i = 0; | 594 const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero); |
| 571 do { | 595 const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); |
| 572 __m128i workp_shft; | 596 const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); |
| 573 __m128i a, b, c; | 597 const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); |
| 598 const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); |
| 599 const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); |
| 600 const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); |
| 601 const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); |
| 602 const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); |
| 603 const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero); |
| 604 const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero); |
| 605 const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero); |
| 606 const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero); |
| 574 | 607 |
| 575 unsigned int off = i * 8; | 608 const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero); |
| 576 p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[7 * 16] + off)), | 609 const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero); |
| 577 zero); | 610 const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero); |
| 578 p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[6 * 16] + off)), | 611 const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero); |
| 579 zero); | 612 const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); |
| 580 p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[5 * 16] + off)), | 613 const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); |
| 581 zero); | 614 const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); |
| 582 p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[4 * 16] + off)), | 615 const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); |
| 583 zero); | 616 const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); |
| 584 p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[3 * 16] + off)), | 617 const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); |
| 585 zero); | 618 const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); |
| 586 p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[2 * 16] + off)), | 619 const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); |
| 587 zero); | 620 const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero); |
| 588 p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[1 * 16] + off)), | 621 const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero); |
| 589 zero); | 622 const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero); |
| 590 p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[0 * 16] + off)), | 623 const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero); |
| 591 zero); | |
| 592 q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[0 * 16] + off)), | |
| 593 zero); | |
| 594 q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[1 * 16] + off)), | |
| 595 zero); | |
| 596 q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[2 * 16] + off)), | |
| 597 zero); | |
| 598 q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[3 * 16] + off)), | |
| 599 zero); | |
| 600 q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[4 * 16] + off)), | |
| 601 zero); | |
| 602 q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[5 * 16] + off)), | |
| 603 zero); | |
| 604 q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[6 * 16] + off)), | |
| 605 zero); | |
| 606 q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[7 * 16] + off)), | |
| 607 zero); | |
| 608 | 624 |
| 609 c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7 | 625 __m128i f_lo; |
| 610 c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c)); | 626 __m128i f_hi; |
| 611 | 627 |
| 612 b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2)); | 628 f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7 |
| 613 a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1)); | 629 f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), |
| 614 a = _mm_add_epi16(_mm_add_epi16(p0, q0), a); | 630 _mm_add_epi16(p4_lo, f_lo)); |
| 631 f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo), |
| 632 _mm_add_epi16(p2_lo, p1_lo)); |
| 633 f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo); |
| 634 f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo); |
| 615 | 635 |
| 616 _mm_storel_epi64((__m128i *)&flat_op[2 * 16 + i * 8], | 636 f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7 |
| 617 _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) | 637 f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), |
| 618 , b)); | 638 _mm_add_epi16(p4_hi, f_hi)); |
| 639 f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi), |
| 640 _mm_add_epi16(p2_hi, p1_hi)); |
| 641 f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); |
| 642 f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); |
| 619 | 643 |
| 620 c = _mm_add_epi16(_mm_add_epi16(p5, eight), c); | 644 p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi); |
| 621 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); | 645 _mm_storeu_si128((__m128i *)(s - 7 * p), p6); |
| 622 _mm_storel_epi64((__m128i *)&flat2_op[6 * 16 + i * 8], | |
| 623 _mm_packus_epi16(workp_shft, workp_shft)); | |
| 624 | 646 |
| 625 a = _mm_add_epi16(q1, a); | 647 f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); |
| 626 b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1); | 648 f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); |
| 627 _mm_storel_epi64((__m128i *)&flat_op[1 * 16 + i * 8], | 649 p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); |
| 628 _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) | 650 _mm_storeu_si128((__m128i *)(s - 6 * p), p5); |
| 629 , b)); | |
| 630 | 651 |
| 631 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5); | 652 f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo); |
| 632 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); | 653 f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi); |
| 633 _mm_storel_epi64((__m128i *)&flat2_op[5 * 16 + i * 8], | 654 p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); |
| 634 _mm_packus_epi16(workp_shft, workp_shft)); | 655 _mm_storeu_si128((__m128i *)(s - 5 * p), p4); |
| 635 | 656 |
| 636 a = _mm_add_epi16(q2, a); | 657 f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); |
| 637 b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0); | 658 f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); |
| 638 _mm_storel_epi64((__m128i *)&flat_op[i * 8], | 659 p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); |
| 639 _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) | 660 _mm_storeu_si128((__m128i *)(s - 4 * p), p3); |
| 640 , b)); | |
| 641 | 661 |
| 642 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4); | 662 f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); |
| 643 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); | 663 f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); |
| 644 _mm_storel_epi64((__m128i *)&flat2_op[4 * 16 + i * 8], | 664 op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); |
| 645 _mm_packus_epi16(workp_shft, workp_shft)); | 665 _mm_storeu_si128((__m128i *)(s - 3 * p), op2); |
| 646 | 666 |
| 647 a = _mm_add_epi16(q3, a); | 667 f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); |
| 648 b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0); | 668 f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi); |
| 649 _mm_storel_epi64((__m128i *)&flat_oq[i * 8], | 669 op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); |
| 650 _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) | 670 _mm_storeu_si128((__m128i *)(s - 2 * p), op1); |
| 651 , b)); | |
| 652 | 671 |
| 653 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3); | 672 f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); |
| 654 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); | 673 f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); |
| 655 _mm_storel_epi64((__m128i *)&flat2_op[3 * 16 + i * 8], | 674 op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); |
| 656 _mm_packus_epi16(workp_shft, workp_shft)); | 675 _mm_storeu_si128((__m128i *)(s - 1 * p), op0); |
| 657 | 676 |
| 658 b = _mm_add_epi16(q3, b); | 677 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); |
| 659 b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1); | 678 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); |
| 660 _mm_storel_epi64((__m128i *)&flat_oq[16 + i * 8], | 679 oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); |
| 661 _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) | 680 _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); |
| 662 , b)); | |
| 663 | 681 |
| 664 c = _mm_add_epi16(q4, c); | 682 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); |
| 665 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2); | 683 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); |
| 666 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); | 684 oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); |
| 667 _mm_storel_epi64((__m128i *)&flat2_op[2 * 16 + i * 8], | 685 _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); |
| 668 _mm_packus_epi16(workp_shft, workp_shft)); | |
| 669 | 686 |
| 670 b = _mm_add_epi16(q3, b); | 687 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); |
| 671 b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2); | 688 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); |
| 672 _mm_storel_epi64((__m128i *)&flat_oq[2 * 16 + i * 8], | 689 oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); |
| 673 _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) | 690 _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); |
| 674 , b)); | |
| 675 a = _mm_add_epi16(q5, a); | |
| 676 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1); | |
| 677 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); | |
| 678 _mm_storel_epi64((__m128i *)&flat2_op[16 + i * 8], | |
| 679 _mm_packus_epi16(workp_shft, workp_shft)); | |
| 680 | 691 |
| 681 a = _mm_add_epi16(q6, a); | 692 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); |
| 682 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0); | 693 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); |
| 683 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); | 694 q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); |
| 684 _mm_storel_epi64((__m128i *)&flat2_op[i * 8], | 695 _mm_storeu_si128((__m128i *)(s + 3 * p), q3); |
| 685 _mm_packus_epi16(workp_shft, workp_shft)); | |
| 686 | 696 |
| 687 a = _mm_add_epi16(q7, a); | 697 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); |
| 688 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0); | 698 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); |
| 689 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); | 699 q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); |
| 690 _mm_storel_epi64((__m128i *)&flat2_oq[i * 8], | 700 _mm_storeu_si128((__m128i *)(s + 4 * p), q4); |
| 691 _mm_packus_epi16(workp_shft, workp_shft)); | |
| 692 | 701 |
| 693 a = _mm_add_epi16(q7, a); | 702 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); |
| 694 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1); | 703 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); |
| 695 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); | 704 q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); |
| 696 _mm_storel_epi64((__m128i *)&flat2_oq[16 + i * 8], | 705 _mm_storeu_si128((__m128i *)(s + 5 * p), q5); |
| 697 _mm_packus_epi16(workp_shft, workp_shft)); | |
| 698 | 706 |
| 699 a = _mm_add_epi16(q7, a); | 707 f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); |
| 700 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2); | 708 f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); |
| 701 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); | 709 q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi); |
| 702 _mm_storel_epi64((__m128i *)&flat2_oq[2 * 16 + i * 8], | 710 _mm_storeu_si128((__m128i *)(s + 6 * p), q6); |
| 703 _mm_packus_epi16(workp_shft, workp_shft)); | |
| 704 | |
| 705 a = _mm_add_epi16(q7, a); | |
| 706 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3); | |
| 707 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); | |
| 708 _mm_storel_epi64((__m128i *)&flat2_oq[3 * 16 + i * 8], | |
| 709 _mm_packus_epi16(workp_shft, workp_shft)); | |
| 710 | |
| 711 a = _mm_add_epi16(q7, a); | |
| 712 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4); | |
| 713 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); | |
| 714 _mm_storel_epi64((__m128i *)&flat2_oq[4 * 16 + i * 8], | |
| 715 _mm_packus_epi16(workp_shft, workp_shft)); | |
| 716 | |
| 717 a = _mm_add_epi16(q7, a); | |
| 718 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5); | |
| 719 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); | |
| 720 _mm_storel_epi64((__m128i *)&flat2_oq[5 * 16 + i * 8], | |
| 721 _mm_packus_epi16(workp_shft, workp_shft)); | |
| 722 | |
| 723 a = _mm_add_epi16(q7, a); | |
| 724 c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6); | |
| 725 workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); | |
| 726 _mm_storel_epi64((__m128i *)&flat2_oq[6 * 16 + i * 8], | |
| 727 _mm_packus_epi16(workp_shft, workp_shft)); | |
| 728 | |
| 729 temp_flat2 = _mm_srli_si128(temp_flat2, 8); | |
| 730 src += 8; | |
| 731 } while (++i < 2); | |
| 732 } | 711 } |
| 733 // wide flat | 712 // wide flat |
| 734 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 713 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 735 | |
| 736 work_a = _mm_load_si128((__m128i *)&ap[2 * 16]); | |
| 737 p2 = _mm_load_si128((__m128i *)&flat_op[2 * 16]); | |
| 738 work_a = _mm_andnot_si128(flat, work_a); | |
| 739 p2 = _mm_and_si128(flat, p2); | |
| 740 p2 = _mm_or_si128(work_a, p2); | |
| 741 _mm_store_si128((__m128i *)&flat_op[2 * 16], p2); | |
| 742 | |
| 743 p1 = _mm_load_si128((__m128i *)&flat_op[1 * 16]); | |
| 744 work_a = _mm_andnot_si128(flat, ps1); | |
| 745 p1 = _mm_and_si128(flat, p1); | |
| 746 p1 = _mm_or_si128(work_a, p1); | |
| 747 _mm_store_si128((__m128i *)&flat_op[1 * 16], p1); | |
| 748 | |
| 749 p0 = _mm_load_si128((__m128i *)&flat_op[0]); | |
| 750 work_a = _mm_andnot_si128(flat, ps0); | |
| 751 p0 = _mm_and_si128(flat, p0); | |
| 752 p0 = _mm_or_si128(work_a, p0); | |
| 753 _mm_store_si128((__m128i *)&flat_op[0], p0); | |
| 754 | |
| 755 q0 = _mm_load_si128((__m128i *)&flat_oq[0]); | |
| 756 work_a = _mm_andnot_si128(flat, qs0); | |
| 757 q0 = _mm_and_si128(flat, q0); | |
| 758 q0 = _mm_or_si128(work_a, q0); | |
| 759 _mm_store_si128((__m128i *)&flat_oq[0], q0); | |
| 760 | |
| 761 q1 = _mm_load_si128((__m128i *)&flat_oq[1 * 16]); | |
| 762 work_a = _mm_andnot_si128(flat, qs1); | |
| 763 q1 = _mm_and_si128(flat, q1); | |
| 764 q1 = _mm_or_si128(work_a, q1); | |
| 765 _mm_store_si128((__m128i *)&flat_oq[1 * 16], q1); | |
| 766 | |
| 767 work_a = _mm_load_si128((__m128i *)&aq[2 * 16]); | |
| 768 q2 = _mm_load_si128((__m128i *)&flat_oq[2 * 16]); | |
| 769 work_a = _mm_andnot_si128(flat, work_a); | |
| 770 q2 = _mm_and_si128(flat, q2); | |
| 771 q2 = _mm_or_si128(work_a, q2); | |
| 772 _mm_store_si128((__m128i *)&flat_oq[2 * 16], q2); | |
| 773 | |
| 774 // write out op6 - op3 | |
| 775 { | |
| 776 unsigned char *dst = (s - 7 * p); | |
| 777 for (i = 6; i > 2; i--) { | |
| 778 __m128i flat2_output; | |
| 779 work_a = _mm_load_si128((__m128i *)&ap[i * 16]); | |
| 780 flat2_output = _mm_load_si128((__m128i *)&flat2_op[i * 16]); | |
| 781 work_a = _mm_andnot_si128(flat2, work_a); | |
| 782 flat2_output = _mm_and_si128(flat2, flat2_output); | |
| 783 work_a = _mm_or_si128(work_a, flat2_output); | |
| 784 _mm_storeu_si128((__m128i *)dst, work_a); | |
| 785 dst += p; | |
| 786 } | |
| 787 } | |
| 788 | |
| 789 work_a = _mm_load_si128((__m128i *)&flat_op[2 * 16]); | |
| 790 p2 = _mm_load_si128((__m128i *)&flat2_op[2 * 16]); | |
| 791 work_a = _mm_andnot_si128(flat2, work_a); | |
| 792 p2 = _mm_and_si128(flat2, p2); | |
| 793 p2 = _mm_or_si128(work_a, p2); | |
| 794 _mm_storeu_si128((__m128i *)(s - 3 * p), p2); | |
| 795 | |
| 796 work_a = _mm_load_si128((__m128i *)&flat_op[1 * 16]); | |
| 797 p1 = _mm_load_si128((__m128i *)&flat2_op[1 * 16]); | |
| 798 work_a = _mm_andnot_si128(flat2, work_a); | |
| 799 p1 = _mm_and_si128(flat2, p1); | |
| 800 p1 = _mm_or_si128(work_a, p1); | |
| 801 _mm_storeu_si128((__m128i *)(s - 2 * p), p1); | |
| 802 | |
| 803 work_a = _mm_load_si128((__m128i *)&flat_op[0]); | |
| 804 p0 = _mm_load_si128((__m128i *)&flat2_op[0]); | |
| 805 work_a = _mm_andnot_si128(flat2, work_a); | |
| 806 p0 = _mm_and_si128(flat2, p0); | |
| 807 p0 = _mm_or_si128(work_a, p0); | |
| 808 _mm_storeu_si128((__m128i *)(s - 1 * p), p0); | |
| 809 | |
| 810 work_a = _mm_load_si128((__m128i *)&flat_oq[0]); | |
| 811 q0 = _mm_load_si128((__m128i *)&flat2_oq[0]); | |
| 812 work_a = _mm_andnot_si128(flat2, work_a); | |
| 813 q0 = _mm_and_si128(flat2, q0); | |
| 814 q0 = _mm_or_si128(work_a, q0); | |
| 815 _mm_storeu_si128((__m128i *)(s - 0 * p), q0); | |
| 816 | |
| 817 work_a = _mm_load_si128((__m128i *)&flat_oq[1 * 16]); | |
| 818 q1 = _mm_load_si128((__m128i *)&flat2_oq[16]); | |
| 819 work_a = _mm_andnot_si128(flat2, work_a); | |
| 820 q1 = _mm_and_si128(flat2, q1); | |
| 821 q1 = _mm_or_si128(work_a, q1); | |
| 822 _mm_storeu_si128((__m128i *)(s + 1 * p), q1); | |
| 823 | |
| 824 work_a = _mm_load_si128((__m128i *)&flat_oq[2 * 16]); | |
| 825 q2 = _mm_load_si128((__m128i *)&flat2_oq[2 * 16]); | |
| 826 work_a = _mm_andnot_si128(flat2, work_a); | |
| 827 q2 = _mm_and_si128(flat2, q2); | |
| 828 q2 = _mm_or_si128(work_a, q2); | |
| 829 _mm_storeu_si128((__m128i *)(s + 2 * p), q2); | |
| 830 | |
| 831 // write out oq3 - oq7 | |
| 832 { | |
| 833 unsigned char *dst = (s + 3 * p); | |
| 834 for (i = 3; i < 7; i++) { | |
| 835 __m128i flat2_output; | |
| 836 work_a = _mm_load_si128((__m128i *)&aq[i * 16]); | |
| 837 flat2_output = _mm_load_si128((__m128i *)&flat2_oq[i * 16]); | |
| 838 work_a = _mm_andnot_si128(flat2, work_a); | |
| 839 flat2_output = _mm_and_si128(flat2, flat2_output); | |
| 840 work_a = _mm_or_si128(work_a, flat2_output); | |
| 841 _mm_storeu_si128((__m128i *)dst, work_a); | |
| 842 dst += p; | |
| 843 } | |
| 844 } | |
| 845 } | 714 } |
| 846 } | 715 } |
| 847 | 716 |
| 848 // TODO(yunqingwang): remove the count parameter and call these 2 functions (8 or 16) directly. | 717 // TODO(yunqingwang): remove the count parameter and call these 2 functions (8 or 16) directly. |
| 849 void vp9_lpf_horizontal_16_sse2(unsigned char *s, int p, | 718 void vp9_lpf_horizontal_16_sse2(unsigned char *s, int p, |
| 850 const unsigned char *_blimit, | 719 const unsigned char *_blimit, |
| 851 const unsigned char *_limit, | 720 const unsigned char *_limit, |
| 852 const unsigned char *_thresh, int count) { | 721 const unsigned char *_thresh, int count) { |
| 853 if (count == 1) | 722 if (count == 1) |
| 854 mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh); | 723 mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh); |
| (...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 886 _mm_loadl_epi64((__m128i *)(s - 0 * p))); | 755 _mm_loadl_epi64((__m128i *)(s - 0 * p))); |
| 887 p1q1 = _mm_shuffle_epi32(q1p1, 78); | 756 p1q1 = _mm_shuffle_epi32(q1p1, 78); |
| 888 p0q0 = _mm_shuffle_epi32(q0p0, 78); | 757 p0q0 = _mm_shuffle_epi32(q0p0, 78); |
| 889 | 758 |
| 890 { | 759 { |
| 891 // filter_mask and hev_mask | 760 // filter_mask and hev_mask |
| 892 const __m128i one = _mm_set1_epi8(1); | 761 const __m128i one = _mm_set1_epi8(1); |
| 893 const __m128i fe = _mm_set1_epi8(0xfe); | 762 const __m128i fe = _mm_set1_epi8(0xfe); |
| 894 const __m128i ff = _mm_cmpeq_epi8(fe, fe); | 763 const __m128i ff = _mm_cmpeq_epi8(fe, fe); |
| 895 __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; | 764 __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; |
| 896 abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), | 765 abs_p1p0 = abs_diff(q1p1, q0p0); |
| 897 _mm_subs_epu8(q0p0, q1p1)); | |
| 898 abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); | 766 abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); |
| 899 | 767 |
| 900 abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), | 768 abs_p0q0 = abs_diff(q0p0, p0q0); |
| 901 _mm_subs_epu8(p0q0, q0p0)); | 769 abs_p1q1 = abs_diff(q1p1, p1q1); |
| 902 abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), | |
| 903 _mm_subs_epu8(p1q1, q1p1)); | |
| 904 flat = _mm_max_epu8(abs_p1p0, abs_q1q0); | 770 flat = _mm_max_epu8(abs_p1p0, abs_q1q0); |
| 905 hev = _mm_subs_epu8(flat, thresh); | 771 hev = _mm_subs_epu8(flat, thresh); |
| 906 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); | 772 hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); |
| 907 | 773 |
| 908 abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); | 774 abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); |
| 909 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); | 775 abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); |
| 910 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); | 776 mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); |
| 911 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); | 777 mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); |
| 912 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; | 778 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; |
| 913 mask = _mm_max_epu8(abs_p1p0, mask); | 779 mask = _mm_max_epu8(abs_p1p0, mask); |
| 914 // mask |= (abs(p1 - p0) > limit) * -1; | 780 // mask |= (abs(p1 - p0) > limit) * -1; |
| 915 // mask |= (abs(q1 - q0) > limit) * -1; | 781 // mask |= (abs(q1 - q0) > limit) * -1; |
| 916 | 782 |
| 917 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1), | 783 work = _mm_max_epu8(abs_diff(q2p2, q1p1), |
| 918 _mm_subs_epu8(q1p1, q2p2)), | 784 abs_diff(q3p3, q2p2)); |
| 919 _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), | |
| 920 _mm_subs_epu8(q2p2, q3p3))); | |
| 921 mask = _mm_max_epu8(work, mask); | 785 mask = _mm_max_epu8(work, mask); |
| 922 mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); | 786 mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); |
| 923 mask = _mm_subs_epu8(mask, limit); | 787 mask = _mm_subs_epu8(mask, limit); |
| 924 mask = _mm_cmpeq_epi8(mask, zero); | 788 mask = _mm_cmpeq_epi8(mask, zero); |
| 925 | 789 |
| 926 // flat_mask4 | 790 // flat_mask4 |
| 927 | 791 |
| 928 flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0), | 792 flat = _mm_max_epu8(abs_diff(q2p2, q0p0), |
| 929 _mm_subs_epu8(q0p0, q2p2)), | 793 abs_diff(q3p3, q0p0)); |
| 930 _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), | |
| 931 _mm_subs_epu8(q0p0, q3p3))); | |
| 932 flat = _mm_max_epu8(abs_p1p0, flat); | 794 flat = _mm_max_epu8(abs_p1p0, flat); |
| 933 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); | 795 flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); |
| 934 flat = _mm_subs_epu8(flat, one); | 796 flat = _mm_subs_epu8(flat, one); |
| 935 flat = _mm_cmpeq_epi8(flat, zero); | 797 flat = _mm_cmpeq_epi8(flat, zero); |
| 936 flat = _mm_and_si128(flat, mask); | 798 flat = _mm_and_si128(flat, mask); |
| 937 } | 799 } |
| 938 | 800 |
| 939 { | 801 { |
| 940 const __m128i four = _mm_set1_epi16(4); | 802 const __m128i four = _mm_set1_epi16(4); |
| 941 unsigned char *src = s; | 803 unsigned char *src = s; |
| (...skipping 765 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1707 transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); | 1569 transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); |
| 1708 | 1570 |
| 1709 // Loop filtering | 1571 // Loop filtering |
| 1710 mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit, | 1572 mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit, |
| 1711 thresh); | 1573 thresh); |
| 1712 | 1574 |
| 1713 // Transpose back | 1575 // Transpose back |
| 1714 transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); | 1576 transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); |
| 1715 transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); | 1577 transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); |
| 1716 } | 1578 } |
| OLD | NEW |