Index: source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c |
diff --git a/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c |
index 4c3495b056033380e5c99f27914b6f4e36362abb..0a105629f067478a5754101879b38364e07af345 100644 |
--- a/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c |
+++ b/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c |
@@ -56,3 +56,117 @@ unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) { |
avg = _mm_extract_epi16(s0, 0); |
return (avg + 8) >> 4; |
} |
+ |
+// Accumulates `height` rows of a 16-pixel-wide strip into 16 column sums. |
+// |
+// hbuf       - output: 16 int16_t column sums. Must be 16-byte aligned |
+//              because it is written with _mm_store_si128. |
+// ref        - first pixel of the strip. Every row start (ref + k * |
+//              ref_stride) must be 16-byte aligned because rows are read |
+//              with _mm_load_si128. |
+// ref_stride - distance in bytes between the starts of consecutive rows. |
+// height     - number of rows to sum. Any value is accepted; height <= 0 |
+//              stores all-zero sums and never dereferences ref. |
+// |
+// Column sums use unsigned saturating 16-bit adds, so a column clamps at |
+// 65535 (only reachable when height > 257). |
+void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref, |
+                          const int ref_stride, const int height) { |
+  const __m128i zero = _mm_setzero_si128(); |
+  __m128i s0 = zero;  // running sums for columns 0..7 |
+  __m128i s1 = zero;  // running sums for columns 8..15 |
+  int idx; |
+ |
+  // One row per iteration, so exactly `height` rows are read whether the |
+  // count is even, odd, or smaller than two. |
+  for (idx = 0; idx < height; ++idx) { |
+    const __m128i src_line = |
+        _mm_load_si128((const __m128i *)(ref + idx * ref_stride)); |
+    // Zero-extend the 16 bytes into two vectors of eight 16-bit lanes. |
+    s0 = _mm_adds_epu16(s0, _mm_unpacklo_epi8(src_line, zero)); |
+    s1 = _mm_adds_epu16(s1, _mm_unpackhi_epi8(src_line, zero)); |
+  } |
+ |
+  _mm_store_si128((__m128i *)hbuf, s0); |
+  _mm_store_si128((__m128i *)(hbuf + 8), s1); |
+} |
+ |
+// Returns the sum of the `width` bytes starting at ref. |
+// |
+// ref   - source bytes. Must be 16-byte aligned when width >= 16, because |
+//         full groups of 16 are read with _mm_load_si128. |
+// width - number of bytes to sum. Full groups of 16 are summed with |
+//         _mm_sad_epu8; a remainder of 1..15 bytes is added with scalar |
+//         code. width <= 0 returns 0. |
+// |
+// The result is exact while the total fits in int16_t, which always holds |
+// for width <= 128 (128 * 255 = 32640). |
+int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) { |
+  const __m128i zero = _mm_setzero_si128(); |
+  __m128i s0 = zero; |
+  int idx = 0; |
+  int sum; |
+ |
+  // SAD against zero sums each 8-byte half into its own 64-bit lane. |
+  for (; width - idx >= 16; idx += 16) { |
+    const __m128i src_line = _mm_load_si128((const __m128i *)(ref + idx)); |
+    s0 = _mm_adds_epu16(s0, _mm_sad_epu8(src_line, zero)); |
+  } |
+ |
+  // Fold the upper lane onto the lower one; the total is then in word 0. |
+  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8)); |
+  sum = _mm_extract_epi16(s0, 0); |
+ |
+  // Bytes left over after the last full group of 16. |
+  for (; idx < width; ++idx) { |
+    sum += ref[idx]; |
+  } |
+ |
+  return (int16_t)sum; |
+} |
+ |
+// Returns the sum of absolute differences between two int16_t vectors. |
+// |
+// ref   - first vector; no alignment requirement (read with |
+//         _mm_loadu_si128). |
+// src   - second vector. Must be 16-byte aligned when width >= 8, because |
+//         full groups of 8 are read with _mm_load_si128. |
+// width - number of elements to compare. Full groups of 8 use SSE2; a |
+//         remainder of 1..7 elements is handled with scalar code. |
+//         width <= 0 returns 0. |
+// |
+// Each difference is formed with a saturating 16-bit subtract, so |
+// ref[i] - src[i] is clamped to [-32768, 32767] before its magnitude is |
+// taken. The scalar remainder applies the same clamp. |
+int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src, |
+                        const int width) { |
+  const __m128i zero = _mm_setzero_si128(); |
+  __m128i sum = zero;  // four 32-bit partial sums |
+  int idx = 0; |
+  int sad; |
+ |
+  for (; width - idx >= 8; idx += 8) { |
+    const __m128i v0 = _mm_loadu_si128((const __m128i *)(ref + idx)); |
+    const __m128i v1 = _mm_load_si128((const __m128i *)(src + idx)); |
+    __m128i diff = _mm_subs_epi16(v0, v1); |
+    // sign is 0 or -1 per lane; (diff ^ sign) - sign is |diff|. |
+    const __m128i sign = _mm_srai_epi16(diff, 15); |
+    diff = _mm_sub_epi16(_mm_xor_si128(diff, sign), sign); |
+ |
+    // Zero-extend to 32 bits so a magnitude of 32768 is kept exactly. |
+    sum = _mm_add_epi32(sum, _mm_unpacklo_epi16(diff, zero)); |
+    sum = _mm_add_epi32(sum, _mm_unpackhi_epi16(diff, zero)); |
+  } |
+ |
+  // Horizontal add of the four 32-bit lanes into lane 0. |
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); |
+  sum = _mm_add_epi32(sum, _mm_srli_epi64(sum, 32)); |
+  sad = _mm_cvtsi128_si32(sum); |
+ |
+  // Elements left over after the last full group of 8. |
+  for (; idx < width; ++idx) { |
+    int d = ref[idx] - src[idx]; |
+    if (d > 32767) d = 32767; |
+    if (d < -32768) d = -32768; |
+    sad += d < 0 ? -d : d; |
+  } |
+ |
+  return sad; |
+} |