Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(91)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c

Issue 341293003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after
104 for (i = output_height; i > 1; i-=2) { 104 for (i = output_height; i > 1; i-=2) {
105 // load the 2 strides of source 105 // load the 2 strides of source
106 srcReg32b1 = _mm256_castsi128_si256( 106 srcReg32b1 = _mm256_castsi128_si256(
107 _mm_loadu_si128((__m128i *)(src_ptr-3))); 107 _mm_loadu_si128((__m128i *)(src_ptr-3)));
108 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, 108 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
109 _mm_loadu_si128((__m128i *) 109 _mm_loadu_si128((__m128i *)
110 (src_ptr+src_pixels_per_line-3)), 1); 110 (src_ptr+src_pixels_per_line-3)), 1);
111 111
112 // filter the source buffer 112 // filter the source buffer
113 srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); 113 srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
114 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); 114 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
115 115
116 // multiply 2 adjacent elements with the filter and add the result 116 // multiply 2 adjacent elements with the filter and add the result
117 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); 117 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
118 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); 118 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
119 119
120 // add and saturate the results together 120 // add and saturate the results together
121 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); 121 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
122 122
123 // filter the source buffer 123 // filter the source buffer
124 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); 124 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
125 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); 125 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
126 126
127 // multiply 2 adjacent elements with the filter and add the result 127 // multiply 2 adjacent elements with the filter and add the result
128 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); 128 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
129 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); 129 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
130 130
131 // add and saturate the results together 131 // add and saturate the results together
132 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, 132 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
133 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); 133 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
134 134
135 // reading 2 strides of the next 16 bytes 135 // reading 2 strides of the next 16 bytes
136 // (part of it was being read by earlier read) 136 // (part of it was being read by earlier read)
137 srcReg32b2 = _mm256_castsi128_si256( 137 srcReg32b2 = _mm256_castsi128_si256(
138 _mm_loadu_si128((__m128i *)(src_ptr+5))); 138 _mm_loadu_si128((__m128i *)(src_ptr+5)));
139 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, 139 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
140 _mm_loadu_si128((__m128i *) 140 _mm_loadu_si128((__m128i *)
141 (src_ptr+src_pixels_per_line+5)), 1); 141 (src_ptr+src_pixels_per_line+5)), 1);
142 142
143 // add and saturate the results together 143 // add and saturate the results together
144 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, 144 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
145 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); 145 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
146 146
147 // filter the source buffer 147 // filter the source buffer
148 srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); 148 srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
149 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); 149 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
150 150
151 // multiply 2 adjacent elements with the filter and add the result 151 // multiply 2 adjacent elements with the filter and add the result
152 srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); 152 srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
153 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); 153 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
154 154
155 // add and saturate the results together 155 // add and saturate the results together
156 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); 156 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
157 157
158 // filter the source buffer 158 // filter the source buffer
159 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg); 159 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
160 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); 160 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
161 161
162 // multiply 2 adjacent elements with the filter and add the result 162 // multiply 2 adjacent elements with the filter and add the result
163 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); 163 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
164 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); 164 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
165 165
166 // add and saturate the results together 166 // add and saturate the results together
167 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, 167 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
168 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); 168 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
169 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, 169 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
170 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); 170 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
171 171
172 172
173 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); 173 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
(...skipping 27 matching lines...) Expand all
201 if (i > 0) { 201 if (i > 0) {
202 __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; 202 __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
203 __m128i srcRegFilt2, srcRegFilt3; 203 __m128i srcRegFilt2, srcRegFilt3;
204 204
205 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); 205 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
206 206
207 // filter the source buffer 207 // filter the source buffer
208 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, 208 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
209 _mm256_castsi256_si128(filt1Reg)); 209 _mm256_castsi256_si128(filt1Reg));
210 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, 210 srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
211 _mm256_castsi256_si128(filt2Reg)); 211 _mm256_castsi256_si128(filt4Reg));
212 212
213 // multiply 2 adjacent elements with the filter and add the result 213 // multiply 2 adjacent elements with the filter and add the result
214 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, 214 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
215 _mm256_castsi256_si128(firstFilters)); 215 _mm256_castsi256_si128(firstFilters));
216 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, 216 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
217 _mm256_castsi256_si128(secondFilters)); 217 _mm256_castsi256_si128(forthFilters));
218 218
219 // add and saturate the results together 219 // add and saturate the results together
220 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); 220 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
221 221
222 // filter the source buffer 222 // filter the source buffer
223 srcRegFilt3= _mm_shuffle_epi8(srcReg1, 223 srcRegFilt3= _mm_shuffle_epi8(srcReg1,
224 _mm256_castsi256_si128(filt4Reg)); 224 _mm256_castsi256_si128(filt2Reg));
225 srcRegFilt2= _mm_shuffle_epi8(srcReg1, 225 srcRegFilt2= _mm_shuffle_epi8(srcReg1,
226 _mm256_castsi256_si128(filt3Reg)); 226 _mm256_castsi256_si128(filt3Reg));
227 227
228 // multiply 2 adjacent elements with the filter and add the result 228 // multiply 2 adjacent elements with the filter and add the result
229 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, 229 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
230 _mm256_castsi256_si128(forthFilters)); 230 _mm256_castsi256_si128(secondFilters));
231 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, 231 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
232 _mm256_castsi256_si128(thirdFilters)); 232 _mm256_castsi256_si128(thirdFilters));
233 233
234 // add and saturate the results together 234 // add and saturate the results together
235 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, 235 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
236 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); 236 _mm_min_epi16(srcRegFilt3, srcRegFilt2));
237 237
238 // reading the next 16 bytes 238 // reading the next 16 bytes
239 // (part of it was being read by earlier read) 239 // (part of it was being read by earlier read)
240 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); 240 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
241 241
242 // add and saturate the results together 242 // add and saturate the results together
243 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, 243 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
244 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); 244 _mm_max_epi16(srcRegFilt3, srcRegFilt2));
245 245
246 // filter the source buffer 246 // filter the source buffer
247 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, 247 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
248 _mm256_castsi256_si128(filt1Reg)); 248 _mm256_castsi256_si128(filt1Reg));
249 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, 249 srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
250 _mm256_castsi256_si128(filt2Reg)); 250 _mm256_castsi256_si128(filt4Reg));
251 251
252 // multiply 2 adjacent elements with the filter and add the result 252 // multiply 2 adjacent elements with the filter and add the result
253 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, 253 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
254 _mm256_castsi256_si128(firstFilters)); 254 _mm256_castsi256_si128(firstFilters));
255 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, 255 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
256 _mm256_castsi256_si128(secondFilters)); 256 _mm256_castsi256_si128(forthFilters));
257 257
258 // add and saturate the results together 258 // add and saturate the results together
259 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); 259 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
260 260
261 // filter the source buffer 261 // filter the source buffer
262 srcRegFilt3 = _mm_shuffle_epi8(srcReg2, 262 srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
263 _mm256_castsi256_si128(filt4Reg)); 263 _mm256_castsi256_si128(filt2Reg));
264 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, 264 srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
265 _mm256_castsi256_si128(filt3Reg)); 265 _mm256_castsi256_si128(filt3Reg));
266 266
267 // multiply 2 adjacent elements with the filter and add the result 267 // multiply 2 adjacent elements with the filter and add the result
268 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, 268 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
269 _mm256_castsi256_si128(forthFilters)); 269 _mm256_castsi256_si128(secondFilters));
270 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, 270 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
271 _mm256_castsi256_si128(thirdFilters)); 271 _mm256_castsi256_si128(thirdFilters));
272 272
273 // add and saturate the results together 273 // add and saturate the results together
274 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, 274 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
275 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); 275 _mm_min_epi16(srcRegFilt3, srcRegFilt2));
276 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, 276 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
277 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); 277 _mm_max_epi16(srcRegFilt3, srcRegFilt2));
278 278
279 279
(...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after
535 535
536 // shrink to 8 bit each 16 bits, the first lane contain the first 536 // shrink to 8 bit each 16 bits, the first lane contain the first
537 // convolve result and the second lane contain the second convolve 537 // convolve result and the second lane contain the second convolve
538 // result 538 // result
539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); 539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
540 540
541 // save 16 bytes 541 // save 16 bytes
542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); 542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
543 } 543 }
544 } 544 }
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/x86/vp9_postproc_sse2.asm ('k') | source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698