OLD | NEW |
| (Empty) |
#include <string.h>

#include <tmmintrin.h>
#include <emmintrin.h>
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"
5 #pragma GCC push_options | |
6 #pragma GCC optimize("unroll-loops") | |
7 | |
8 DECLARE_ALIGNED(16, const unsigned char, filt1_4_h8[16]) = { | |
9 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 | |
10 }; | |
11 DECLARE_ALIGNED(16, const unsigned char, filt2_4_h8[16]) = { | |
12 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 | |
13 }; | |
14 DECLARE_ALIGNED(16, const unsigned char, filt1_global[16]) = { | |
15 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 | |
16 }; | |
17 DECLARE_ALIGNED(16, const unsigned char, filt2_global[16]) = { | |
18 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 | |
19 }; | |
20 DECLARE_ALIGNED(16, const unsigned char, filt3_global[16]) = { | |
21 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 | |
22 }; | |
23 DECLARE_ALIGNED(16, const unsigned char, filt4_global[16]) = { | |
24 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 | |
25 }; | |
26 | |
27 void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, | |
28 unsigned int src_pixels_per_line, | |
29 unsigned char *output_ptr, | |
30 unsigned int output_pitch, | |
31 unsigned int output_height, | |
32 short *filter) { | |
33 __m128i addFilterReg127, filtersReg, firstFilters, secondFilters, | |
34 thirdFilters, forthFilters, srcReg, srcRegFilt1, srcRegFilt2, | |
35 srcRegFilt3, srcRegFilt4; | |
36 unsigned int i; | |
37 | |
38 addFilterReg127 = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)0x0400040u), 0); | |
39 filtersReg = _mm_loadu_si128((__m128i *)filter); | |
40 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); | |
41 firstFilters = _mm_shufflelo_epi16(filtersReg, 0); | |
42 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); | |
43 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); | |
44 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); | |
45 thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8); | |
46 forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8); | |
47 | |
48 for (i = 0; i < output_height; i++) { | |
49 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); | |
50 srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters); | |
51 srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters); | |
52 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); | |
53 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); | |
54 srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); | |
55 srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); | |
56 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); | |
57 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); | |
58 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); | |
59 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg127); | |
60 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); | |
61 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); | |
62 src_ptr += src_pixels_per_line; | |
63 *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); | |
64 output_ptr += output_pitch; | |
65 } | |
66 } | |
67 | |
68 void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, | |
69 unsigned int src_pixels_per_line, | |
70 unsigned char *output_ptr, | |
71 unsigned int output_pitch, | |
72 unsigned int output_height, | |
73 short *filter) { | |
74 __m128i addFilterReg127, filtersReg, firstFilters, secondFilters, | |
75 thirdFilters, forthFilters, srcReg, srcRegFilt1, srcRegFilt2, | |
76 srcRegFilt3, srcRegFilt4; | |
77 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; | |
78 unsigned int i; | |
79 | |
80 addFilterReg127 = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)0x0400040u), 0); | |
81 filtersReg = _mm_loadu_si128((__m128i *)filter); | |
82 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | |
83 | |
84 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | |
85 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | |
86 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | |
87 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | |
88 | |
89 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); | |
90 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); | |
91 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); | |
92 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); | |
93 | |
94 for (i = 0 ; i < output_height ; i++) { | |
95 srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); | |
96 srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); | |
97 srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); | |
98 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); | |
99 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); | |
100 srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); | |
101 srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); | |
102 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); | |
103 srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); | |
104 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); | |
105 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); | |
106 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); | |
107 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg127); | |
108 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); | |
109 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); | |
110 src_ptr += src_pixels_per_line; | |
111 _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); | |
112 output_ptr += output_pitch; | |
113 } | |
114 } | |
115 | |
116 void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, | |
117 unsigned int src_pixels_per_line, | |
118 unsigned char *output_ptr, | |
119 unsigned int output_pitch, | |
120 unsigned int output_height, | |
121 short *filter) { | |
122 __m128i addFilterReg127, filtersReg, firstFilters, secondFilters, | |
123 thirdFilters, forthFilters, srcReg1, srcReg2, srcRegFilt1_1, | |
124 srcRegFilt2_1, srcRegFilt2, srcRegFilt3; | |
125 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; | |
126 unsigned int i; | |
127 | |
128 addFilterReg127 = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)0x0400040u), 0); | |
129 filtersReg = _mm_loadu_si128((__m128i *)filter); | |
130 filtersReg =_mm_packs_epi16(filtersReg, filtersReg); | |
131 | |
132 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); | |
133 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); | |
134 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); | |
135 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); | |
136 | |
137 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); | |
138 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); | |
139 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); | |
140 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); | |
141 | |
142 for (i = 0 ; i < output_height ; i++) { | |
143 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); | |
144 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, filt1Reg); | |
145 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, filt2Reg); | |
146 | |
147 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); | |
148 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); | |
149 | |
150 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); | |
151 | |
152 srcRegFilt3 = _mm_shuffle_epi8(srcReg1, filt4Reg); | |
153 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, filt3Reg); | |
154 | |
155 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); | |
156 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); | |
157 | |
158 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt3); | |
159 | |
160 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); | |
161 | |
162 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); | |
163 | |
164 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, filt1Reg); | |
165 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, filt2Reg); | |
166 | |
167 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); | |
168 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); | |
169 | |
170 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); | |
171 | |
172 srcRegFilt3 = _mm_shuffle_epi8(srcReg2, filt4Reg); | |
173 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, filt3Reg); | |
174 | |
175 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); | |
176 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); | |
177 | |
178 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt3); | |
179 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); | |
180 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg127); | |
181 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg127); | |
182 srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); | |
183 srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); | |
184 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); | |
185 | |
186 src_ptr += src_pixels_per_line; | |
187 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); | |
188 output_ptr += output_pitch; | |
189 } | |
190 } | |
191 | |
// Vertical 8-tap filter, 4 output pixels per row.
//
// src_ptr        points at the row aligned with tap 0; each output row reads
//                4 bytes from each of the 8 rows src_ptr + k * src_pitch,
//                k = 0..7.
// src_pitch      source stride in bytes.
// output_ptr     destination; 4 bytes are written per row, at any alignment.
// out_pitch      destination stride in bytes.
// output_height  number of rows to produce.
// filter         8 16-bit taps. They are narrowed to signed 8-bit with
//                saturation before use.
//
// Each output is (sum + 64) >> 7 clamped to [0, 255].
void vp9_filter_block1d4_v8_intrin_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pitch,
                                         unsigned char *output_ptr,
                                         unsigned int out_pitch,
                                         unsigned int output_height,
                                         short *filter) {
  // Rounding term: 64 in every 16-bit lane.
  const __m128i round64 = _mm_set1_epi16(64);
  __m128i taps, taps_01_23, taps_45_67;
  unsigned int row;

  // Narrow the taps to 8 bits; the packed register holds k0..k7 twice.
  taps = _mm_loadu_si128((__m128i *)filter);
  taps = _mm_packs_epi16(taps, taps);
  // Low half: (k0,k1) repeated; high half: (k2,k3) repeated.
  taps_01_23 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(taps, 0), 0x55u);
  // Low half: (k4,k5) repeated; high half: (k6,k7) repeated.
  taps_45_67 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(taps, 0xAAu), 0xFFu);

  for (row = 0; row < output_height; row++) {
    __m128i line[8];
    __m128i rows_01_23, rows_45_67, sum_01_23, sum_45_67, sum_23, sum_67, acc;
    unsigned int k;
    int word;

    // Load 4 pixels from each of the 8 source rows. memcpy avoids reading
    // through a cast int pointer that may be misaligned.
    for (k = 0; k < 8; k++) {
      memcpy(&word, src_ptr + src_pitch * k, sizeof(word));
      line[k] = _mm_cvtsi32_si128(word);
    }

    // Interleave vertically adjacent rows so each byte pair lines up with a
    // tap pair, two row pairs per register.
    rows_01_23 = _mm_unpacklo_epi64(_mm_unpacklo_epi8(line[0], line[1]),
                                    _mm_unpacklo_epi8(line[2], line[3]));
    rows_45_67 = _mm_unpacklo_epi64(_mm_unpacklo_epi8(line[4], line[5]),
                                    _mm_unpacklo_epi8(line[6], line[7]));

    // Low 4 lanes: k0k1 / k4k5 partial sums; high 4 lanes: k2k3 / k6k7.
    sum_01_23 = _mm_maddubs_epi16(rows_01_23, taps_01_23);
    sum_45_67 = _mm_maddubs_epi16(rows_45_67, taps_45_67);
    sum_23 = _mm_srli_si128(sum_01_23, 8);
    sum_67 = _mm_srli_si128(sum_45_67, 8);

    // Saturating adds are order sensitive. Add the outer pairs first, then
    // the smaller centre term, then the larger one, so the running sum
    // cannot clip at 32767 before a negative term has been applied.
    acc = _mm_adds_epi16(sum_01_23, sum_67);
    acc = _mm_adds_epi16(acc, _mm_min_epi16(sum_23, sum_45_67));
    acc = _mm_adds_epi16(acc, _mm_max_epi16(sum_23, sum_45_67));

    // Round, scale back by 128 and clamp to 8 bits.
    acc = _mm_adds_epi16(acc, round64);
    acc = _mm_srai_epi16(acc, 7);
    acc = _mm_packus_epi16(acc, acc);

    // Only the low 4 bytes are valid.
    word = _mm_cvtsi128_si32(acc);
    memcpy(output_ptr, &word, sizeof(word));

    src_ptr += src_pitch;
    output_ptr += out_pitch;
  }
}
248 | |
// Vertical 8-tap filter, 8 output pixels per row.
//
// src_ptr        points at the row aligned with tap 0; each output row reads
//                8 bytes from each of the 8 rows src_ptr + k * src_pitch,
//                k = 0..7.
// src_pitch      source stride in bytes.
// output_ptr     destination; 8 bytes are written per row.
// out_pitch      destination stride in bytes.
// output_height  number of rows to produce.
// filter         8 16-bit taps. They are narrowed to signed 8-bit with
//                saturation before use.
//
// Each output is (sum + 64) >> 7 clamped to [0, 255].
void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pitch,
                                         unsigned char *output_ptr,
                                         unsigned int out_pitch,
                                         unsigned int output_height,
                                         short *filter) {
  // Rounding term: 64 in every 16-bit lane.
  const __m128i round64 = _mm_set1_epi16(64);
  __m128i taps, taps_01, taps_23, taps_45, taps_67;
  unsigned int row;

  // Narrow the taps to 8 bits, then broadcast each (even, odd) tap pair
  // across a whole register.
  taps = _mm_loadu_si128((__m128i *)filter);
  taps = _mm_packs_epi16(taps, taps);
  taps_01 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0100));
  taps_23 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0302));
  taps_45 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0504));
  taps_67 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0706));

  for (row = 0; row < output_height; row++) {
    __m128i line[8];
    __m128i sum_01, sum_23, sum_45, sum_67, acc;
    unsigned int k;

    // Load 8 pixels from each of the 8 source rows.
    for (k = 0; k < 8; k++) {
      line[k] = _mm_loadl_epi64((__m128i *)(src_ptr + src_pitch * k));
    }

    // Interleave vertically adjacent rows so each byte pair lines up with a
    // tap pair, then multiply and add the pair.
    sum_01 = _mm_maddubs_epi16(_mm_unpacklo_epi8(line[0], line[1]), taps_01);
    sum_23 = _mm_maddubs_epi16(_mm_unpacklo_epi8(line[2], line[3]), taps_23);
    sum_45 = _mm_maddubs_epi16(_mm_unpacklo_epi8(line[4], line[5]), taps_45);
    sum_67 = _mm_maddubs_epi16(_mm_unpacklo_epi8(line[6], line[7]), taps_67);

    // Saturating adds are order sensitive. Add the outer pairs first, then
    // the smaller centre term, then the larger one, so the running sum
    // cannot clip at 32767 before a negative term has been applied.
    acc = _mm_adds_epi16(sum_01, sum_67);
    acc = _mm_adds_epi16(acc, _mm_min_epi16(sum_23, sum_45));
    acc = _mm_adds_epi16(acc, _mm_max_epi16(sum_23, sum_45));

    // Round, scale back by 128 and clamp to 8 bits.
    acc = _mm_adds_epi16(acc, round64);
    acc = _mm_srai_epi16(acc, 7);
    acc = _mm_packus_epi16(acc, acc);

    // Store the low 8 bytes.
    _mm_storel_epi64((__m128i *)output_ptr, acc);

    src_ptr += src_pitch;
    output_ptr += out_pitch;
  }
}
304 | |
// Vertical 8-tap filter, 16 output pixels per row.
//
// src_ptr        points at the row aligned with tap 0; each output row reads
//                16 bytes from each of the 8 rows src_ptr + k * src_pitch,
//                k = 0..7.
// src_pitch      source stride in bytes.
// output_ptr     destination; 16 bytes are written per row. No alignment is
//                required.
// out_pitch      destination stride in bytes.
// output_height  number of rows to produce.
// filter         8 16-bit taps. They are narrowed to signed 8-bit with
//                saturation before use.
//
// Each output is (sum + 64) >> 7 clamped to [0, 255].
void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pitch,
                                          unsigned char *output_ptr,
                                          unsigned int out_pitch,
                                          unsigned int output_height,
                                          short *filter) {
  // Rounding term: 64 in every 16-bit lane.
  const __m128i round64 = _mm_set1_epi16(64);
  __m128i taps, taps_01, taps_23, taps_45, taps_67;
  unsigned int row;

  // Narrow the taps to 8 bits, then broadcast each (even, odd) tap pair
  // across a whole register.
  taps = _mm_loadu_si128((__m128i *)filter);
  taps = _mm_packs_epi16(taps, taps);
  taps_01 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0100));
  taps_23 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0302));
  taps_45 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0504));
  taps_67 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0706));

  for (row = 0; row < output_height; row++) {
    __m128i line[8];
    __m128i sum_01, sum_23, sum_45, sum_67, left, right;
    unsigned int k;

    // Load 16 pixels from each of the 8 source rows.
    for (k = 0; k < 8; k++) {
      line[k] = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * k));
    }

    // Outputs 0..7: interleave the low halves of vertically adjacent rows so
    // each byte pair lines up with a tap pair.
    sum_01 = _mm_maddubs_epi16(_mm_unpacklo_epi8(line[0], line[1]), taps_01);
    sum_23 = _mm_maddubs_epi16(_mm_unpacklo_epi8(line[2], line[3]), taps_23);
    sum_45 = _mm_maddubs_epi16(_mm_unpacklo_epi8(line[4], line[5]), taps_45);
    sum_67 = _mm_maddubs_epi16(_mm_unpacklo_epi8(line[6], line[7]), taps_67);

    // Saturating adds are order sensitive. Add the outer pairs first, then
    // the smaller centre term, then the larger one, so the running sum
    // cannot clip at 32767 before a negative term has been applied.
    left = _mm_adds_epi16(sum_01, sum_67);
    left = _mm_adds_epi16(left, _mm_min_epi16(sum_23, sum_45));
    left = _mm_adds_epi16(left, _mm_max_epi16(sum_23, sum_45));
    left = _mm_adds_epi16(left, round64);
    left = _mm_srai_epi16(left, 7);

    // Outputs 8..15: same computation on the high halves.
    sum_01 = _mm_maddubs_epi16(_mm_unpackhi_epi8(line[0], line[1]), taps_01);
    sum_23 = _mm_maddubs_epi16(_mm_unpackhi_epi8(line[2], line[3]), taps_23);
    sum_45 = _mm_maddubs_epi16(_mm_unpackhi_epi8(line[4], line[5]), taps_45);
    sum_67 = _mm_maddubs_epi16(_mm_unpackhi_epi8(line[6], line[7]), taps_67);

    right = _mm_adds_epi16(sum_01, sum_67);
    right = _mm_adds_epi16(right, _mm_min_epi16(sum_23, sum_45));
    right = _mm_adds_epi16(right, _mm_max_epi16(sum_23, sum_45));
    right = _mm_adds_epi16(right, round64);
    right = _mm_srai_epi16(right, 7);

    // Clamp both halves to 8 bits and write all 16 pixels. The unaligned
    // store avoids a fault when the destination row is not 16-byte aligned.
    _mm_storeu_si128((__m128i *)output_ptr, _mm_packus_epi16(left, right));

    src_ptr += src_pitch;
    output_ptr += out_pitch;
  }
}
OLD | NEW |