| Index: source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
|
| diff --git a/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
|
| index 4c3495b056033380e5c99f27914b6f4e36362abb..0a105629f067478a5754101879b38364e07af345 100644
|
| --- a/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
|
| +++ b/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
|
| @@ -56,3 +56,117 @@ unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
|
| avg = _mm_extract_epi16(s0, 0);
|
| return (avg + 8) >> 4;
|
| }
|
| +
|
| +void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
|
| + const int ref_stride, const int height) {
|
| + int idx;
|
| + __m128i zero = _mm_setzero_si128();
|
| + __m128i src_line = _mm_load_si128((const __m128i *)ref);
|
| + __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
|
| + __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
|
| + __m128i t0, t1;
|
| + int height_1 = height - 1;
|
| + ref += ref_stride;
|
| +
|
| + for (idx = 1; idx < height_1; idx += 2) {
|
| + src_line = _mm_load_si128((const __m128i *)ref);
|
| + t0 = _mm_unpacklo_epi8(src_line, zero);
|
| + t1 = _mm_unpackhi_epi8(src_line, zero);
|
| + s0 = _mm_adds_epu16(s0, t0);
|
| + s1 = _mm_adds_epu16(s1, t1);
|
| + ref += ref_stride;
|
| +
|
| + src_line = _mm_load_si128((const __m128i *)ref);
|
| + t0 = _mm_unpacklo_epi8(src_line, zero);
|
| + t1 = _mm_unpackhi_epi8(src_line, zero);
|
| + s0 = _mm_adds_epu16(s0, t0);
|
| + s1 = _mm_adds_epu16(s1, t1);
|
| + ref += ref_stride;
|
| + }
|
| +
|
| + src_line = _mm_load_si128((const __m128i *)ref);
|
| + t0 = _mm_unpacklo_epi8(src_line, zero);
|
| + t1 = _mm_unpackhi_epi8(src_line, zero);
|
| + s0 = _mm_adds_epu16(s0, t0);
|
| + s1 = _mm_adds_epu16(s1, t1);
|
| +
|
| + _mm_store_si128((__m128i *)hbuf, s0);
|
| + hbuf += 8;
|
| + _mm_store_si128((__m128i *)hbuf, s1);
|
| +}
|
| +
|
| +int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
|
| + __m128i zero = _mm_setzero_si128();
|
| + __m128i src_line = _mm_load_si128((const __m128i *)ref);
|
| + __m128i s0 = _mm_sad_epu8(src_line, zero);
|
| + __m128i s1;
|
| + (void) width; // width = 64
|
| +
|
| + ref += 16;
|
| + src_line = _mm_load_si128((const __m128i *)ref);
|
| + s1 = _mm_sad_epu8(src_line, zero);
|
| + s0 = _mm_adds_epu16(s0, s1);
|
| +
|
| + ref += 16;
|
| + src_line = _mm_load_si128((const __m128i *)ref);
|
| + s1 = _mm_sad_epu8(src_line, zero);
|
| + s0 = _mm_adds_epu16(s0, s1);
|
| +
|
| + ref += 16;
|
| + src_line = _mm_load_si128((const __m128i *)ref);
|
| + s1 = _mm_sad_epu8(src_line, zero);
|
| + s0 = _mm_adds_epu16(s0, s1);
|
| +
|
| + s1 = _mm_srli_si128(s0, 8);
|
| + s0 = _mm_adds_epu16(s0, s1);
|
| +
|
| + return _mm_extract_epi16(s0, 0);
|
| +}
|
| +
|
| +int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src,
|
| + const int width) {
|
| + int idx;
|
| + __m128i zero = _mm_setzero_si128();
|
| + __m128i sum;
|
| + __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
|
| + __m128i v1 = _mm_load_si128((const __m128i *)src);
|
| + __m128i diff = _mm_subs_epi16(v0, v1);
|
| + __m128i sign = _mm_srai_epi16(diff, 15);
|
| +
|
| + diff = _mm_xor_si128(diff, sign);
|
| + sum = _mm_sub_epi16(diff, sign);
|
| +
|
| + (void) width; // width = 64;
|
| +
|
| + ref += 8;
|
| + src += 8;
|
| +
|
| + v0 = _mm_unpacklo_epi16(sum, zero);
|
| + v1 = _mm_unpackhi_epi16(sum, zero);
|
| + sum = _mm_add_epi32(v0, v1);
|
| +
|
| + for (idx = 1; idx < 8; ++idx) {
|
| + v0 = _mm_loadu_si128((const __m128i *)ref);
|
| + v1 = _mm_load_si128((const __m128i *)src);
|
| + diff = _mm_subs_epi16(v0, v1);
|
| + sign = _mm_srai_epi16(diff, 15);
|
| + diff = _mm_xor_si128(diff, sign);
|
| + diff = _mm_sub_epi16(diff, sign);
|
| +
|
| + v0 = _mm_unpacklo_epi16(diff, zero);
|
| + v1 = _mm_unpackhi_epi16(diff, zero);
|
| +
|
| + sum = _mm_add_epi32(sum, v0);
|
| + sum = _mm_add_epi32(sum, v1);
|
| +
|
| + ref += 8;
|
| + src += 8;
|
| + }
|
| +
|
| + v0 = _mm_srli_si128(sum, 8);
|
| + sum = _mm_add_epi32(sum, v0);
|
| + v0 = _mm_srli_epi64(sum, 32);
|
| + sum = _mm_add_epi32(sum, v0);
|
| +
|
| + return _mm_cvtsi128_si32(sum);
|
| +}
|
|
|