Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(87)

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c

Issue 996503002: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/libvpx/vp9/encoder/vp9_svc_layercontext.c ('k') | source/libvpx/vp9/vp9_cx_iface.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after
83 s1 = _mm_adds_epu16(s1, t1); 83 s1 = _mm_adds_epu16(s1, t1);
84 ref += ref_stride; 84 ref += ref_stride;
85 } 85 }
86 86
87 src_line = _mm_load_si128((const __m128i *)ref); 87 src_line = _mm_load_si128((const __m128i *)ref);
88 t0 = _mm_unpacklo_epi8(src_line, zero); 88 t0 = _mm_unpacklo_epi8(src_line, zero);
89 t1 = _mm_unpackhi_epi8(src_line, zero); 89 t1 = _mm_unpackhi_epi8(src_line, zero);
90 s0 = _mm_adds_epu16(s0, t0); 90 s0 = _mm_adds_epu16(s0, t0);
91 s1 = _mm_adds_epu16(s1, t1); 91 s1 = _mm_adds_epu16(s1, t1);
92 92
93 if (height == 64) {
94 s0 = _mm_srai_epi16(s0, 5);
95 s1 = _mm_srai_epi16(s1, 5);
96 } else if (height == 32) {
97 s0 = _mm_srai_epi16(s0, 4);
98 s1 = _mm_srai_epi16(s1, 4);
99 } else {
100 s0 = _mm_srai_epi16(s0, 3);
101 s1 = _mm_srai_epi16(s1, 3);
102 }
103
93 _mm_store_si128((__m128i *)hbuf, s0); 104 _mm_store_si128((__m128i *)hbuf, s0);
94 hbuf += 8; 105 hbuf += 8;
95 _mm_store_si128((__m128i *)hbuf, s1); 106 _mm_store_si128((__m128i *)hbuf, s1);
96 } 107 }
97 108
98 int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) { 109 int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
99 __m128i zero = _mm_setzero_si128(); 110 __m128i zero = _mm_setzero_si128();
100 __m128i src_line = _mm_load_si128((const __m128i *)ref); 111 __m128i src_line = _mm_load_si128((const __m128i *)ref);
101 __m128i s0 = _mm_sad_epu8(src_line, zero); 112 __m128i s0 = _mm_sad_epu8(src_line, zero);
102 __m128i s1; 113 __m128i s1;
103 (void) width; // width = 64 114 int i;
115 const int norm_factor = 3 + (width >> 5);
104 116
105 ref += 16; 117 for (i = 16; i < width; i += 16) {
106 src_line = _mm_load_si128((const __m128i *)ref); 118 ref += 16;
107 s1 = _mm_sad_epu8(src_line, zero); 119 src_line = _mm_load_si128((const __m128i *)ref);
108 s0 = _mm_adds_epu16(s0, s1); 120 s1 = _mm_sad_epu8(src_line, zero);
109 121 s0 = _mm_adds_epu16(s0, s1);
110 ref += 16; 122 }
111 src_line = _mm_load_si128((const __m128i *)ref);
112 s1 = _mm_sad_epu8(src_line, zero);
113 s0 = _mm_adds_epu16(s0, s1);
114
115 ref += 16;
116 src_line = _mm_load_si128((const __m128i *)ref);
117 s1 = _mm_sad_epu8(src_line, zero);
118 s0 = _mm_adds_epu16(s0, s1);
119 123
120 s1 = _mm_srli_si128(s0, 8); 124 s1 = _mm_srli_si128(s0, 8);
121 s0 = _mm_adds_epu16(s0, s1); 125 s0 = _mm_adds_epu16(s0, s1);
122 126
123 return _mm_extract_epi16(s0, 0); 127 return _mm_extract_epi16(s0, 0) >> norm_factor;
124 } 128 }
125 129
126 int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src, 130 int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
127 const int width) { 131 const int bwl) {
128 int idx; 132 int idx;
129 __m128i zero = _mm_setzero_si128(); 133 int width = 4 << bwl;
130 __m128i sum; 134 int16_t mean;
131 __m128i v0 = _mm_loadu_si128((const __m128i *)ref); 135 __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
132 __m128i v1 = _mm_load_si128((const __m128i *)src); 136 __m128i v1 = _mm_load_si128((const __m128i *)src);
133 __m128i diff = _mm_subs_epi16(v0, v1); 137 __m128i diff = _mm_subs_epi16(v0, v1);
134 __m128i sign = _mm_srai_epi16(diff, 15); 138 __m128i sum = diff;
135 139 __m128i sse = _mm_madd_epi16(diff, diff);
136 diff = _mm_xor_si128(diff, sign);
137 sum = _mm_sub_epi16(diff, sign);
138
139 (void) width; // width = 64;
140 140
141 ref += 8; 141 ref += 8;
142 src += 8; 142 src += 8;
143 143
144 v0 = _mm_unpacklo_epi16(sum, zero); 144 for (idx = 8; idx < width; idx += 8) {
145 v1 = _mm_unpackhi_epi16(sum, zero);
146 sum = _mm_add_epi32(v0, v1);
147
148 for (idx = 1; idx < 8; ++idx) {
149 v0 = _mm_loadu_si128((const __m128i *)ref); 145 v0 = _mm_loadu_si128((const __m128i *)ref);
150 v1 = _mm_load_si128((const __m128i *)src); 146 v1 = _mm_load_si128((const __m128i *)src);
151 diff = _mm_subs_epi16(v0, v1); 147 diff = _mm_subs_epi16(v0, v1);
152 sign = _mm_srai_epi16(diff, 15);
153 diff = _mm_xor_si128(diff, sign);
154 diff = _mm_sub_epi16(diff, sign);
155 148
156 v0 = _mm_unpacklo_epi16(diff, zero); 149 sum = _mm_add_epi16(sum, diff);
157 v1 = _mm_unpackhi_epi16(diff, zero); 150 v0 = _mm_madd_epi16(diff, diff);
158 151 sse = _mm_add_epi32(sse, v0);
159 sum = _mm_add_epi32(sum, v0);
160 sum = _mm_add_epi32(sum, v1);
161 152
162 ref += 8; 153 ref += 8;
163 src += 8; 154 src += 8;
164 } 155 }
165 156
166 v0 = _mm_srli_si128(sum, 8); 157 v0 = _mm_srli_si128(sum, 8);
167 sum = _mm_add_epi32(sum, v0); 158 sum = _mm_add_epi16(sum, v0);
168 v0 = _mm_srli_epi64(sum, 32); 159 v0 = _mm_srli_epi64(sum, 32);
169 sum = _mm_add_epi32(sum, v0); 160 sum = _mm_add_epi16(sum, v0);
161 v0 = _mm_srli_epi32(sum, 16);
162 sum = _mm_add_epi16(sum, v0);
170 163
171 return _mm_cvtsi128_si32(sum); 164 v1 = _mm_srli_si128(sse, 8);
165 sse = _mm_add_epi32(sse, v1);
166 v1 = _mm_srli_epi64(sse, 32);
167 sse = _mm_add_epi32(sse, v1);
168
169 mean = _mm_extract_epi16(sum, 0);
170
171 return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
172 } 172 }
OLDNEW
« no previous file with comments | « source/libvpx/vp9/encoder/vp9_svc_layercontext.c ('k') | source/libvpx/vp9/vp9_cx_iface.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698