OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 38 matching lines...)
49 s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); | 49 s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); |
50 s0 = _mm_adds_epu16(s0, s1); | 50 s0 = _mm_adds_epu16(s0, s1); |
51 s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); | 51 s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); |
52 s0 = _mm_adds_epu16(s0, s1); | 52 s0 = _mm_adds_epu16(s0, s1); |
53 | 53 |
54 s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); | 54 s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); |
55 s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); | 55 s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); |
56 avg = _mm_extract_epi16(s0, 0); | 56 avg = _mm_extract_epi16(s0, 0); |
57 return (avg + 8) >> 4; | 57 return (avg + 8) >> 4; |
58 } | 58 } |
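
Note: only the reduction tail of this function is visible above the fold. The two shift-and-add steps collapse the four 16-bit column sums into lane 0, and (avg + 8) >> 4 is a round-to-nearest divide by 16. A minimal scalar cross-check, assuming this is the 4x4 block average with a (const uint8_t *s, int p) signature implied by the visible strided loads:

#include <stdint.h>

static unsigned int avg_4x4_ref(const uint8_t *s, int p) {
  int r, c, sum = 0;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c) sum += s[r * p + c];  /* 16 pixels total */
  return (sum + 8) >> 4;  /* round to nearest */
}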
| 59 |
 | 60 void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, |
| 61 const int ref_stride, const int height) { |
| 62 int idx; |
| 63 __m128i zero = _mm_setzero_si128(); |
| 64 __m128i src_line = _mm_load_si128((const __m128i *)ref); |
 | 65 __m128i s0 = _mm_unpacklo_epi8(src_line, zero);  // zero-extend low 8 pixels |
 | 66 __m128i s1 = _mm_unpackhi_epi8(src_line, zero);  // zero-extend high 8 pixels |
| 67 __m128i t0, t1; |
| 68 int height_1 = height - 1; |
| 69 ref += ref_stride; |
| 70 |
| 71 for (idx = 1; idx < height_1; idx += 2) { |
| 72 src_line = _mm_load_si128((const __m128i *)ref); |
| 73 t0 = _mm_unpacklo_epi8(src_line, zero); |
| 74 t1 = _mm_unpackhi_epi8(src_line, zero); |
| 75 s0 = _mm_adds_epu16(s0, t0); |
| 76 s1 = _mm_adds_epu16(s1, t1); |
| 77 ref += ref_stride; |
| 78 |
| 79 src_line = _mm_load_si128((const __m128i *)ref); |
| 80 t0 = _mm_unpacklo_epi8(src_line, zero); |
| 81 t1 = _mm_unpackhi_epi8(src_line, zero); |
| 82 s0 = _mm_adds_epu16(s0, t0); |
| 83 s1 = _mm_adds_epu16(s1, t1); |
| 84 ref += ref_stride; |
| 85 } |
| 86 |
| 87 src_line = _mm_load_si128((const __m128i *)ref); |
| 88 t0 = _mm_unpacklo_epi8(src_line, zero); |
| 89 t1 = _mm_unpackhi_epi8(src_line, zero); |
| 90 s0 = _mm_adds_epu16(s0, t0); |
| 91 s1 = _mm_adds_epu16(s1, t1); |
| 92 |
| 93 _mm_store_si128((__m128i *)hbuf, s0); |
| 94 hbuf += 8; |
| 95 _mm_store_si128((__m128i *)hbuf, s1); |
| 96 } |
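
Note: vp9_int_pro_row_sse2 builds the vertical projection of a 16-pixel-wide block: each column is summed over `height` rows into hbuf[0..15] with saturating 16-bit adds. The first row is consumed before the loop and the last one after it, so the two-rows-per-iteration loop assumes an even height. A scalar sketch of the same projection (ignoring the saturation the SSE2 adds provide):

#include <stdint.h>

static void int_pro_row_ref(int16_t *hbuf, const uint8_t *ref,
                            int ref_stride, int height) {
  int r, c;
  for (c = 0; c < 16; ++c) hbuf[c] = 0;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < 16; ++c) hbuf[c] += ref[c];  /* per-column running sum */
    ref += ref_stride;
  }
}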
| 97 |
| 98 int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) { |
| 99 __m128i zero = _mm_setzero_si128(); |
| 100 __m128i src_line = _mm_load_si128((const __m128i *)ref); |
 | 101 __m128i s0 = _mm_sad_epu8(src_line, zero);  // SAD vs. zero: sum of 16 bytes |
| 102 __m128i s1; |
| 103 (void) width; // width = 64 |
| 104 |
| 105 ref += 16; |
| 106 src_line = _mm_load_si128((const __m128i *)ref); |
| 107 s1 = _mm_sad_epu8(src_line, zero); |
| 108 s0 = _mm_adds_epu16(s0, s1); |
| 109 |
| 110 ref += 16; |
| 111 src_line = _mm_load_si128((const __m128i *)ref); |
| 112 s1 = _mm_sad_epu8(src_line, zero); |
| 113 s0 = _mm_adds_epu16(s0, s1); |
| 114 |
| 115 ref += 16; |
| 116 src_line = _mm_load_si128((const __m128i *)ref); |
| 117 s1 = _mm_sad_epu8(src_line, zero); |
| 118 s0 = _mm_adds_epu16(s0, s1); |
| 119 |
| 120 s1 = _mm_srli_si128(s0, 8); |
 | 121 s0 = _mm_adds_epu16(s0, s1);  // fold the two 64-bit SAD partials together |
| 122 |
| 123 return _mm_extract_epi16(s0, 0); |
| 124 } |
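
Note: vp9_int_pro_col_sse2 is the matching horizontal projection for a single 64-pixel row: _mm_sad_epu8 against zero sums 16 bytes into a 64-bit lane pair, the four loads cover the hard-coded width of 64, and the final shift-and-add folds the two lanes together. Scalar equivalent:

#include <stdint.h>

static int16_t int_pro_col_ref(const uint8_t *ref, int width) {
  int i, sum = 0;
  for (i = 0; i < width; ++i) sum += ref[i];  /* width is fixed at 64 by the caller */
  return (int16_t)sum;
}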
| 125 |
| 126 int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src, |
| 127 const int width) { |
| 128 int idx; |
| 129 __m128i zero = _mm_setzero_si128(); |
| 130 __m128i sum; |
| 131 __m128i v0 = _mm_loadu_si128((const __m128i *)ref); |
| 132 __m128i v1 = _mm_load_si128((const __m128i *)src); |
| 133 __m128i diff = _mm_subs_epi16(v0, v1); |
 | 134 __m128i sign = _mm_srai_epi16(diff, 15);  // per-lane 0 or -1 |
 | 135 |
 | 136 diff = _mm_xor_si128(diff, sign); |
 | 137 sum = _mm_sub_epi16(diff, sign);  // abs(diff) = (diff ^ sign) - sign |
| 138 |
 | 139 (void) width; // width = 64 |
| 140 |
| 141 ref += 8; |
| 142 src += 8; |
| 143 |
 | 144 v0 = _mm_unpacklo_epi16(sum, zero);  // widen abs diffs to 32 bits |
| 145 v1 = _mm_unpackhi_epi16(sum, zero); |
| 146 sum = _mm_add_epi32(v0, v1); |
| 147 |
| 148 for (idx = 1; idx < 8; ++idx) { |
| 149 v0 = _mm_loadu_si128((const __m128i *)ref); |
| 150 v1 = _mm_load_si128((const __m128i *)src); |
| 151 diff = _mm_subs_epi16(v0, v1); |
| 152 sign = _mm_srai_epi16(diff, 15); |
| 153 diff = _mm_xor_si128(diff, sign); |
| 154 diff = _mm_sub_epi16(diff, sign); |
| 155 |
| 156 v0 = _mm_unpacklo_epi16(diff, zero); |
| 157 v1 = _mm_unpackhi_epi16(diff, zero); |
| 158 |
| 159 sum = _mm_add_epi32(sum, v0); |
| 160 sum = _mm_add_epi32(sum, v1); |
| 161 |
| 162 ref += 8; |
| 163 src += 8; |
| 164 } |
| 165 |
| 166 v0 = _mm_srli_si128(sum, 8); |
| 167 sum = _mm_add_epi32(sum, v0); |
| 168 v0 = _mm_srli_epi64(sum, 32); |
 | 169 sum = _mm_add_epi32(sum, v0);  // horizontal sum of the four 32-bit lanes |
| 170 |
| 171 return _mm_cvtsi128_si32(sum); |
| 172 } |
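
Note: vp9_vector_sad_sse2 computes the sum of absolute differences between two 64-element int16 vectors (e.g. the projections built above). Absolute values come from the branchless identity abs(x) = (x ^ s) - s with s = x >> 15, and each batch of eight 16-bit results is widened to 32 bits before accumulation so the running sum cannot overflow. Scalar equivalent (plain subtraction where the SSE2 path saturates):

#include <stdint.h>
#include <stdlib.h>

static int vector_sad_ref(const int16_t *ref, const int16_t *src, int width) {
  int i, sad = 0;
  for (i = 0; i < width; ++i) sad += abs(ref[i] - src[i]);  /* width fixed at 64 */
  return sad;
}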