Index: source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
===================================================================
--- source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c (revision 293081)
+++ source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c (working copy)
@@ -9,6 +9,7 @@
 */
#include <immintrin.h> /* AVX2 */
+#include "vpx_ports/mem.h"
static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
const unsigned char *_blimit, const unsigned char *_limit,
@@ -392,6 +393,11 @@
}
}
+DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
+ 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
+ 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
+};
+
static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
const unsigned char *_blimit, const unsigned char *_limit,
const unsigned char *_thresh) {
@@ -401,6 +407,9 @@
__m128i p7, p6, p5;
__m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
__m128i q5, q6, q7;
+ __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
+ q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
+ p256_0, q256_0;
const __m128i thresh = _mm_broadcastb_epi8(
_mm_cvtsi32_si128((int) _thresh[0]));
@@ -409,17 +418,38 @@
const __m128i blimit = _mm_broadcastb_epi8(
_mm_cvtsi32_si128((int) _blimit[0]));
- p4 = _mm_loadu_si128((__m128i *) (s - 5 * p));
- p3 = _mm_loadu_si128((__m128i *) (s - 4 * p));
- p2 = _mm_loadu_si128((__m128i *) (s - 3 * p));
- p1 = _mm_loadu_si128((__m128i *) (s - 2 * p));
- p0 = _mm_loadu_si128((__m128i *) (s - 1 * p));
- q0 = _mm_loadu_si128((__m128i *) (s - 0 * p));
- q1 = _mm_loadu_si128((__m128i *) (s + 1 * p));
- q2 = _mm_loadu_si128((__m128i *) (s + 2 * p));
- q3 = _mm_loadu_si128((__m128i *) (s + 3 * p));
- q4 = _mm_loadu_si128((__m128i *) (s + 4 * p));
+ p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 5 * p)));
+ p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 4 * p)));
+ p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 3 * p)));
+ p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 2 * p)));
+ p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 1 * p)));
+ q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 0 * p)));
+ q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 1 * p)));
+ q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 2 * p)));
+ q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 3 * p)));
+ q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 4 * p)));
+ p4 = _mm256_castsi256_si128(p256_4);
+ p3 = _mm256_castsi256_si128(p256_3);
+ p2 = _mm256_castsi256_si128(p256_2);
+ p1 = _mm256_castsi256_si128(p256_1);
+ p0 = _mm256_castsi256_si128(p256_0);
+ q0 = _mm256_castsi256_si128(q256_0);
+ q1 = _mm256_castsi256_si128(q256_1);
+ q2 = _mm256_castsi256_si128(q256_2);
+ q3 = _mm256_castsi256_si128(q256_3);
+ q4 = _mm256_castsi256_si128(q256_4);
+
{
const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
_mm_subs_epu8(p0, p1));
@@ -534,15 +564,23 @@
flat = _mm_cmpeq_epi8(flat, zero);
flat = _mm_and_si128(flat, mask);
- p5 = _mm_loadu_si128((__m128i *) (s - 6 * p));
- q5 = _mm_loadu_si128((__m128i *) (s + 5 * p));
+ p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 6 * p)));
+ q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 5 * p)));
+ p5 = _mm256_castsi256_si128(p256_5);
+ q5 = _mm256_castsi256_si128(q256_5);
flat2 = _mm_max_epu8(
_mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
_mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
flat2 = _mm_max_epu8(work, flat2);
- p6 = _mm_loadu_si128((__m128i *) (s - 7 * p));
- q6 = _mm_loadu_si128((__m128i *) (s + 6 * p));
+ p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 7 * p)));
+ q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 6 * p)));
+ p6 = _mm256_castsi256_si128(p256_6);
+ q6 = _mm256_castsi256_si128(q256_6);
work = _mm_max_epu8(
_mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
_mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
@@ -549,8 +587,12 @@
flat2 = _mm_max_epu8(work, flat2);
- p7 = _mm_loadu_si128((__m128i *) (s - 8 * p));
- q7 = _mm_loadu_si128((__m128i *) (s + 7 * p));
+ p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 8 * p)));
+ q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 7 * p)));
+ p7 = _mm256_castsi256_si128(p256_7);
+ q7 = _mm256_castsi256_si128(q256_7);
work = _mm_max_epu8(
_mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
_mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
@@ -566,29 +608,28 @@
{
const __m256i eight = _mm256_set1_epi16(8);
const __m256i four = _mm256_set1_epi16(4);
- __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
- q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
- p256_0, q256_0;
__m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
res_q;
- p256_7 = _mm256_cvtepu8_epi16(p7);
- p256_6 = _mm256_cvtepu8_epi16(p6);
- p256_5 = _mm256_cvtepu8_epi16(p5);
- p256_4 = _mm256_cvtepu8_epi16(p4);
- p256_3 = _mm256_cvtepu8_epi16(p3);
- p256_2 = _mm256_cvtepu8_epi16(p2);
- p256_1 = _mm256_cvtepu8_epi16(p1);
- p256_0 = _mm256_cvtepu8_epi16(p0);
- q256_0 = _mm256_cvtepu8_epi16(q0);
- q256_1 = _mm256_cvtepu8_epi16(q1);
- q256_2 = _mm256_cvtepu8_epi16(q2);
- q256_3 = _mm256_cvtepu8_epi16(q3);
- q256_4 = _mm256_cvtepu8_epi16(q4);
- q256_5 = _mm256_cvtepu8_epi16(q5);
- q256_6 = _mm256_cvtepu8_epi16(q6);
- q256_7 = _mm256_cvtepu8_epi16(q7);
+ const __m256i filter = _mm256_load_si256(
+ (__m256i const *)filt_loopfilter_avx2);
+ p256_7 = _mm256_shuffle_epi8(p256_7, filter);
+ p256_6 = _mm256_shuffle_epi8(p256_6, filter);
+ p256_5 = _mm256_shuffle_epi8(p256_5, filter);
+ p256_4 = _mm256_shuffle_epi8(p256_4, filter);
+ p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+ p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+ p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+ p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+ q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+ q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+ q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+ q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+ q256_4 = _mm256_shuffle_epi8(q256_4, filter);
+ q256_5 = _mm256_shuffle_epi8(q256_5, filter);
+ q256_6 = _mm256_shuffle_epi8(q256_6, filter);
+ q256_7 = _mm256_shuffle_epi8(q256_7, filter);
pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
_mm256_add_epi16(p256_4, p256_3));