Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(52)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c

Issue 23440041: Libvpx: Pull from upstream (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 7 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « source/libvpx/vp9/common/x86/vp9_asm_stubs.c ('k') | source/libvpx/vp9/encoder/vp9_block.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #include <tmmintrin.h>
2 #include <emmintrin.h>
3 #include "vpx_ports/mem.h"
4 #include "vpx_ports/emmintrin_compat.h"
// GCC-specific: save the current optimization options, then request loop
// unrolling for the code that follows.
// NOTE(review): no matching `#pragma GCC pop_options` is visible in this
// listing - confirm that leaving the option pushed is intentional.
5 #pragma GCC push_options
6 #pragma GCC optimize("unroll-loops")
7
// Byte-shuffle control tables used with _mm_shuffle_epi8 by the horizontal
// filters below. Every entry is the index of a byte inside the 16-byte window
// those filters load from (src_ptr - 3). Consecutive entries form
// (pixel, pixel + 1) pairs so that one _mm_maddubs_epi16 applies two adjacent
// filter taps to each pair.
// The tables are read with the aligned load _mm_load_si128, which is why they
// are declared through DECLARE_ALIGNED(16, ...).

// 4-wide layout, first half of the kernel: the low 8 bytes pair the pixels for
// taps 0/1 of outputs 0..3, the high 8 bytes pair the pixels for taps 2/3.
8 DECLARE_ALIGNED(16, const unsigned char, filt1_4_h8[16]) = {
9 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
10 };
// 4-wide layout, second half of the kernel: low 8 bytes are for taps 4/5,
// high 8 bytes for taps 6/7.
11 DECLARE_ALIGNED(16, const unsigned char, filt2_4_h8[16]) = {
12 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
13 };
// 8-wide layout: pixel pairs for taps 0/1 of outputs 0..7.
14 DECLARE_ALIGNED(16, const unsigned char, filt1_global[16]) = {
15 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
16 };
// 8-wide layout: pixel pairs for taps 2/3 of outputs 0..7.
17 DECLARE_ALIGNED(16, const unsigned char, filt2_global[16]) = {
18 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
19 };
// 8-wide layout: pixel pairs for taps 4/5 of outputs 0..7.
20 DECLARE_ALIGNED(16, const unsigned char, filt3_global[16]) = {
21 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
22 };
// 8-wide layout: pixel pairs for taps 6/7 of outputs 0..7.
23 DECLARE_ALIGNED(16, const unsigned char, filt4_global[16]) = {
24 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
25 };
26
// Horizontal 8-tap filter that produces 4 output pixels per row (SSSE3).
//
// For each of output_height rows, 16 bytes are loaded from (src_ptr - 3), the
// eight taps are applied at four consecutive positions, each sum is rounded as
// (sum + 64) >> 7, clamped to 0..255 by the unsigned pack, and 4 bytes are
// written to output_ptr.
//
//   src_ptr             - current source row; src_ptr[-3] .. src_ptr[12] are
//                         loaded for every row
//   src_pixels_per_line - added to src_ptr after each row
//   output_ptr          - current destination row; 4 bytes written per row
//   output_pitch        - added to output_ptr after each row
//   output_height       - number of rows to process
//   filter              - eight 16-bit taps; narrowed to signed 8-bit with
//                         saturation before use, so taps outside [-128, 127]
//                         are clamped
27 void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
28 unsigned int src_pixels_per_line,
29 unsigned char *output_ptr,
30 unsigned int output_pitch,
31 unsigned int output_height,
32 short *filter) {
33 __m128i addFilterReg127, filtersReg, firstFilters, secondFilters,
34 thirdFilters, forthFilters, srcReg, srcRegFilt1, srcRegFilt2,
35 srcRegFilt3, srcRegFilt4;
36 unsigned int i;
37
// 0x0400040 is 0x00400040: after the 32-bit broadcast every 16-bit lane holds
// 64, the rounding term for the >> 7 below (despite the "127" in the name).
38 addFilterReg127 = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)0x0400040u), 0);
// Load the 8 taps and pack them to signed bytes; the pack duplicates the
// 8 bytes into both halves of the register.
39 filtersReg = _mm_loadu_si128((__m128i *)filter);
40 filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
// firstFilters  = taps (0,1) repeated in the low 8 bytes, taps (2,3) in the
//                 high 8 bytes.
// secondFilters = taps (4,5) repeated in the low 8 bytes, taps (6,7) in the
//                 high 8 bytes.
41 firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
42 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
43 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
44 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
// Note: thirdFilters / forthFilters hold the pixel shuffle masks here, not
// filter taps.
45 thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8);
46 forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8);
47
48 for (i = 0; i < output_height; i++) {
49 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
// Arrange the pixels into (p, p+1) pairs matching the tap layout above.
50 srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters);
51 srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters);
// Unsigned pixel * signed tap, adjacent products summed into 16-bit lanes.
// srcRegFilt1: low lanes = taps 0/1, high lanes = taps 2/3.
// srcRegFilt2: low lanes = taps 4/5, high lanes = taps 6/7.
52 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
53 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
// Bring the high-half partial sums (taps 2/3 and 6/7) down to the low lanes.
54 srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
55 srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
// Saturating accumulation; the order is 0/1 + 2/3, then + 6/7, then + 4/5.
// Only the low four 16-bit lanes carry complete 8-tap sums.
56 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
57 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
58 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
// Round, scale down by 128 and clamp to unsigned 8-bit.
59 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg127);
60 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
61 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
62 src_ptr += src_pixels_per_line;
// Write the low 4 result bytes.
// NOTE(review): the store goes through an int* cast of an unsigned char
// pointer (type-punned, possibly unaligned) - consider memcpy.
63 *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
64 output_ptr += output_pitch;
65 }
66 }
67
// Horizontal 8-tap filter that produces 8 output pixels per row (SSSE3).
//
// For each of output_height rows, 16 bytes are loaded from (src_ptr - 3), the
// eight taps are applied at eight consecutive positions, each sum is rounded
// as (sum + 64) >> 7, clamped to 0..255, and 8 bytes are written to
// output_ptr.
//
//   src_ptr             - current source row; src_ptr[-3] .. src_ptr[12] are
//                         loaded for every row
//   src_pixels_per_line - added to src_ptr after each row
//   output_ptr          - current destination row; 8 bytes written per row
//   output_pitch        - added to output_ptr after each row
//   output_height       - number of rows to process
//   filter              - eight 16-bit taps; narrowed to signed 8-bit with
//                         saturation before use
68 void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
69 unsigned int src_pixels_per_line,
70 unsigned char *output_ptr,
71 unsigned int output_pitch,
72 unsigned int output_height,
73 short *filter) {
74 __m128i addFilterReg127, filtersReg, firstFilters, secondFilters,
75 thirdFilters, forthFilters, srcReg, srcRegFilt1, srcRegFilt2,
76 srcRegFilt3, srcRegFilt4;
77 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
78 unsigned int i;
79
// Every 16-bit lane holds 64, the rounding term for the >> 7 below (the
// literal is 0x00400040; the "127" in the name is historical).
80 addFilterReg127 = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)0x0400040u), 0);
// Load the 8 taps and pack them to signed bytes (with saturation).
81 filtersReg = _mm_loadu_si128((__m128i *)filter);
82 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
83
// A control word of 0x0100 selects bytes {0, 1} for every 16-bit lane, so the
// tap pair (0,1) is broadcast to all lanes; 0x0302, 0x0504 and 0x0706 do the
// same for tap pairs (2,3), (4,5) and (6,7).
84 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
85 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
86 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
87 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
88
// Pixel-pair shuffle masks, one per tap pair.
89 filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
90 filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
91 filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
92 filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
93
94 for (i = 0 ; i < output_height ; i++) {
95 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
// Partial sums for taps 0/1 and 2/3 of the eight outputs.
96 srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
97 srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
98 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
99 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
// Partial sums for taps 4/5 and 6/7 of the eight outputs.
100 srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
101 srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
102 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
103 srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
// Saturating accumulation; the order is 0/1 + 2/3, then + 6/7, then + 4/5.
104 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
105 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
106 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
// Round, scale down by 128 and clamp to unsigned 8-bit.
107 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg127);
108 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
109 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
110 src_ptr += src_pixels_per_line;
// Write the low 8 result bytes.
111 _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
112 output_ptr += output_pitch;
113 }
114 }
115
// Horizontal 8-tap filter that produces 16 output pixels per row (SSSE3).
//
// Each row is processed as two 8-pixel halves: outputs 0..7 come from the
// 16 bytes at (src_ptr - 3), outputs 8..15 from the 16 bytes at (src_ptr + 5).
// Each sum is rounded as (sum + 64) >> 7 and clamped to 0..255; the two halves
// are packed together and written as one 16-byte store.
//
//   src_ptr             - current source row; src_ptr[-3] .. src_ptr[20] are
//                         loaded for every row
//   src_pixels_per_line - added to src_ptr after each row
//   output_ptr          - current destination row; 16 bytes written per row
//                         with an aligned store (see note at the store)
//   output_pitch        - added to output_ptr after each row
//   output_height       - number of rows to process
//   filter              - eight 16-bit taps; narrowed to signed 8-bit with
//                         saturation before use
116 void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
117 unsigned int src_pixels_per_line,
118 unsigned char *output_ptr,
119 unsigned int output_pitch,
120 unsigned int output_height,
121 short *filter) {
122 __m128i addFilterReg127, filtersReg, firstFilters, secondFilters,
123 thirdFilters, forthFilters, srcReg1, srcReg2, srcRegFilt1_1,
124 srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
125 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
126 unsigned int i;
127
// Every 16-bit lane holds 64, the rounding term for the >> 7 below.
128 addFilterReg127 = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)0x0400040u), 0);
// Load the 8 taps and pack them to signed bytes (with saturation).
129 filtersReg = _mm_loadu_si128((__m128i *)filter);
130 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
131
// Broadcast tap pairs (0,1), (2,3), (4,5), (6,7) to every 16-bit lane.
132 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
133 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
134 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
135 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
136
// Pixel-pair shuffle masks, one per tap pair.
137 filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
138 filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
139 filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
140 filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
141
142 for (i = 0 ; i < output_height ; i++) {
// ---- First half: outputs 0..7 ----
143 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
144 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, filt1Reg);
145 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, filt2Reg);
146
// Taps 0/1 and 2/3.
147 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
148 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
149
150 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
151
// srcRegFilt3 takes the taps 6/7 pixels, srcRegFilt2 the taps 4/5 pixels.
152 srcRegFilt3 = _mm_shuffle_epi8(srcReg1, filt4Reg);
153 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, filt3Reg);
154
155 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
156 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
157
// Saturating accumulation: taps 6/7 are added before taps 4/5.
158 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt3);
159
// ---- Second half: outputs 8..15 (load issued before the last add) ----
160 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
161
162 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
163
164 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, filt1Reg);
165 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, filt2Reg);
166
// Taps 0/1 and 2/3.
167 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
168 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
169
170 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
171
// Taps 6/7 (srcRegFilt3) and 4/5 (srcRegFilt2).
172 srcRegFilt3 = _mm_shuffle_epi8(srcReg2, filt4Reg);
173 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, filt3Reg);
174
175 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
176 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
177
178 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt3);
179 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
// Round and scale both halves, then pack them into 16 unsigned bytes
// (first half in the low 8 bytes, second half in the high 8 bytes).
180 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg127);
181 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg127);
182 srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
183 srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
184 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
185
186 src_ptr += src_pixels_per_line;
// _mm_store_si128 is an aligned store: output_ptr must be 16-byte aligned on
// every row.
// NOTE(review): confirm all callers guarantee an aligned output_ptr and an
// output_pitch that keeps it aligned, or switch to _mm_storeu_si128.
187 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
188 output_ptr += output_pitch;
189 }
190 }
191
// Vertical 8-tap filter that produces 4 output pixels per row (SSSE3).
//
// For each output row, 4 bytes are read from each of the 8 source rows
// src_ptr + k * src_pitch (k = 0..7); tap k is applied to row k. Each sum is
// rounded as (sum + 64) >> 7, clamped to 0..255, and 4 bytes are written to
// output_ptr.
//
//   src_ptr       - first (tap 0) source row for the current output row
//   src_pitch     - distance in bytes between source rows; also added to
//                   src_ptr after each output row
//   output_ptr    - current destination row; 4 bytes written per row
//   out_pitch     - added to output_ptr after each row
//   output_height - number of rows to produce
//   filter        - eight 16-bit taps; narrowed to signed 8-bit with
//                   saturation before use
192 void vp9_filter_block1d4_v8_intrin_ssse3(unsigned char *src_ptr,
193 unsigned int src_pitch,
194 unsigned char *output_ptr,
195 unsigned int out_pitch,
196 unsigned int output_height,
197 short *filter) {
198 __m128i addFilterReg127, filtersReg, firstFilters, secondFilters, srcRegFilt1,
199 srcRegFilt2, srcRegFilt3, srcRegFilt4;
200 unsigned int i;
201
// Every 16-bit lane holds 64, the rounding term for the >> 7 below.
202 addFilterReg127 = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)0x0400040u), 0);
// Load the 8 taps and pack them to signed bytes (with saturation).
203 filtersReg = _mm_loadu_si128((__m128i *)filter);
204 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
// firstFilters  = taps (0,1) in the low 8 bytes, taps (2,3) in the high 8.
// secondFilters = taps (4,5) in the low 8 bytes, taps (6,7) in the high 8.
205 firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
206 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
207 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
208 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
209
210 for (i = 0 ; i < output_height ; i++) {
// NOTE(review): the 4-byte row loads below dereference an int* cast of an
// unsigned char pointer (type-punned, possibly unaligned) - consider memcpy.
// Rows 0 and 1, byte-interleaved so each pair is (row0[x], row1[x]).
211 srcRegFilt1 = _mm_cvtsi32_si128(*((int*)&src_ptr[0]));
212 srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch)[0]));
213
214 srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
215
// Rows 2 and 3.
216 srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*2)[0]));
217 srcRegFilt3 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*3)[0]));
218
219 srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
220
// Rows 4 and 5.
221 srcRegFilt3 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*4)[0]));
222 srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*5)[0]));
223
224 srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
// srcRegFilt1: rows 0/1 pairs in the low 8 bytes, rows 2/3 pairs in the high.
225 srcRegFilt1 = _mm_unpacklo_epi64(srcRegFilt1, srcRegFilt2);
226
// Rows 6 and 7.
227 srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*6)[0]));
228 srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*7)[0]));
229
230 srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt4, srcRegFilt2);
// srcRegFilt3: rows 4/5 pairs in the low 8 bytes, rows 6/7 pairs in the high.
231 srcRegFilt3 = _mm_unpacklo_epi64(srcRegFilt3, srcRegFilt4);
232
// Unsigned pixel * signed tap, adjacent products summed into 16-bit lanes.
233 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
234 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
// srcRegFilt2 = taps 2/3 partial sums moved down to the low lanes.
235 srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);
// Saturating accumulation in the low lanes; the order is
// 0/1 + 6/7, then + 2/3, then + 4/5.
236 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_srli_si128(srcRegFilt3, 8));
237 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
238 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
// Round, scale down by 128 and clamp to unsigned 8-bit.
239 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg127);
240 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
241 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
242
// Advance one source row: the 8-row window slides down by one.
243 src_ptr += src_pitch;
// Write the low 4 result bytes (same int* type-punning caveat as the loads).
244 *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
245 output_ptr += out_pitch;
246 }
247 }
248
// Vertical 8-tap filter that produces 8 output pixels per row (SSSE3).
//
// For each output row, 8 bytes are read from each of the 8 source rows
// src_ptr + k * src_pitch (k = 0..7); tap k is applied to row k. Each sum is
// rounded as (sum + 64) >> 7, clamped to 0..255, and 8 bytes are written to
// output_ptr.
//
//   src_ptr       - first (tap 0) source row for the current output row
//   src_pitch     - distance in bytes between source rows; also added to
//                   src_ptr after each output row
//   output_ptr    - current destination row; 8 bytes written per row
//   out_pitch     - added to output_ptr after each row
//   output_height - number of rows to produce
//   filter        - eight 16-bit taps; narrowed to signed 8-bit with
//                   saturation before use
249 void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
250 unsigned int src_pitch,
251 unsigned char *output_ptr,
252 unsigned int out_pitch,
253 unsigned int output_height,
254 short *filter) {
255 __m128i addFilterReg127, filtersReg, firstFilters, secondFilters,
256 thirdFilters, forthFilters, srcRegFilt1, srcRegFilt2, srcRegFilt3,
257 srcRegFilt4, srcRegFilt5, srcRegFilt6;
258 unsigned int i;
259
// Every 16-bit lane holds 64, the rounding term for the >> 7 below.
260 addFilterReg127 = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)0x0400040u), 0);
// Load the 8 taps and pack them to signed bytes (with saturation).
261 filtersReg = _mm_loadu_si128((__m128i *)filter);
262 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
263
// Broadcast tap pairs (0,1), (2,3), (4,5), (6,7) to every 16-bit lane.
264 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
265 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
266 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
267 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
268
269 for (i = 0 ; i < output_height ; i++) {
// Rows 0 and 1.
270 srcRegFilt1 = _mm_loadl_epi64((__m128i*)&src_ptr[0]);
271 srcRegFilt2 = _mm_loadl_epi64((__m128i*)&(src_ptr+src_pitch)[0]);
272
// Rows 2 and 3.
273 srcRegFilt3 = _mm_loadl_epi64((__m128i*)&(src_ptr+src_pitch*2)[0]);
274 srcRegFilt4 = _mm_loadl_epi64((__m128i*)&(src_ptr+src_pitch*3)[0]);
275
// Byte-interleave each row pair so every 16-bit lane is (rowA[x], rowB[x]).
276 srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
277
278 srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
279
// Rows 4 and 5.
280 srcRegFilt2 = _mm_loadl_epi64((__m128i*)&(src_ptr+src_pitch*4)[0]);
281 srcRegFilt4 = _mm_loadl_epi64((__m128i*)&(src_ptr+src_pitch*5)[0]);
282
// Rows 6 and 7.
283 srcRegFilt5 = _mm_loadl_epi64((__m128i*)&(src_ptr+src_pitch*6)[0]);
284 srcRegFilt6 = _mm_loadl_epi64((__m128i*)&(src_ptr+src_pitch*7)[0]);
285
286 srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
287 srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
// Partial sums: srcRegFilt1 = taps 0/1, srcRegFilt3 = taps 2/3,
// srcRegFilt2 = taps 4/5, srcRegFilt5 = taps 6/7.
288 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
289 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
290 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
291 srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
// Saturating accumulation; the order is 0/1 + 6/7, then + 2/3, then + 4/5.
292 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
293 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
294 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
// Round, scale down by 128 and clamp to unsigned 8-bit.
295 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg127);
296 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
297 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
298
// Advance one source row: the 8-row window slides down by one.
299 src_ptr += src_pitch;
// Write the low 8 result bytes.
300 _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
301 output_ptr += out_pitch;
302 }
303 }
304
// Vertical 8-tap filter that produces 16 output pixels per row (SSSE3).
//
// For each output row, 16 bytes are read from each of the 8 source rows
// src_ptr + k * src_pitch (k = 0..7); tap k is applied to row k. The 16
// columns are processed as a low and a high group of 8. Each sum is rounded as
// (sum + 64) >> 7 and clamped to 0..255; both groups are packed and written as
// one 16-byte store.
//
//   src_ptr       - first (tap 0) source row for the current output row
//   src_pitch     - distance in bytes between source rows; also added to
//                   src_ptr after each output row
//   output_ptr    - current destination row; 16 bytes written per row with an
//                   aligned store (see note at the store)
//   out_pitch     - added to output_ptr after each row
//   output_height - number of rows to produce
//   filter        - eight 16-bit taps; narrowed to signed 8-bit with
//                   saturation before use
305 void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
306 unsigned int src_pitch,
307 unsigned char *output_ptr,
308 unsigned int out_pitch,
309 unsigned int output_height,
310 short *filter) {
311 __m128i addFilterReg127, filtersReg, firstFilters, secondFilters,
312 thirdFilters, forthFilters, srcRegFilt1, srcRegFilt2, srcRegFilt3,
313 srcRegFilt4, srcRegFilt5, srcRegFilt6;
314 unsigned int i;
315
// Every 16-bit lane holds 64, the rounding term for the >> 7 below.
316 addFilterReg127 = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)0x0400040u), 0);
// Load the 8 taps and pack them to signed bytes (with saturation).
317 filtersReg = _mm_loadu_si128((__m128i *)filter);
318 filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
319
// Broadcast tap pairs (0,1), (2,3), (4,5), (6,7) to every 16-bit lane.
320 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
321 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
322 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
323 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
324
325 for (i = 0 ; i < output_height ; i++) {
// Rows 0 and 1.
326 srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
327 srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
328
// Rows 6 and 7.
329 srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
330 srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
331
// Byte-interleave each row pair. From here on srcRegFilt5 accumulates the low
// 8 columns and srcRegFilt1 the high 8 columns.
332 srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
333 srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
334 srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
335 srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
336
// Taps 0/1 (rows 0/1) and taps 6/7 (rows 6/7) for both column groups.
337 srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
338 srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
339 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
340 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
341
// Saturating accumulation: 0/1 + 6/7.
342 srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
343 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
344
// Rows 2 and 3 -> taps 2/3.
345 srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
346 srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
347
348 srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
349 srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
350 srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
351 srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
352
353 srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt4);
354 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt6);
355
// Rows 4 and 5 -> taps 4/5.
356 srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
357 srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
358
359 srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
360 srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
361 srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, thirdFilters);
362 srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, thirdFilters);
363 srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt4);
364 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt6);
// Round and scale both column groups, then pack them into 16 unsigned bytes
// (low columns in the low 8 bytes, high columns in the high 8 bytes).
365 srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg127);
366 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg127);
367 srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
368 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
369 srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
370
// Advance one source row: the 8-row window slides down by one.
371 src_ptr += src_pitch;
// _mm_store_si128 is an aligned store: output_ptr must be 16-byte aligned on
// every row.
// NOTE(review): confirm all callers guarantee an aligned output_ptr and an
// out_pitch that keeps it aligned, or switch to _mm_storeu_si128.
372 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
373 output_ptr += out_pitch;
374 }
375 }
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/x86/vp9_asm_stubs.c ('k') | source/libvpx/vp9/encoder/vp9_block.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698