OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
104 for (i = output_height; i > 1; i-=2) { | 104 for (i = output_height; i > 1; i-=2) { |
105 // load the 2 strides of source | 105 // load the 2 strides of source |
106 srcReg32b1 = _mm256_castsi128_si256( | 106 srcReg32b1 = _mm256_castsi128_si256( |
107 _mm_loadu_si128((__m128i *)(src_ptr-3))); | 107 _mm_loadu_si128((__m128i *)(src_ptr-3))); |
108 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, | 108 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, |
109 _mm_loadu_si128((__m128i *) | 109 _mm_loadu_si128((__m128i *) |
110 (src_ptr+src_pixels_per_line-3)), 1); | 110 (src_ptr+src_pixels_per_line-3)), 1); |
111 | 111 |
112 // filter the source buffer | 112 // filter the source buffer |
113 srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); | 113 srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); |
114 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); | 114 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); |
115 | 115 |
116 // multiply 2 adjacent elements with the filter and add the result | 116 // multiply 2 adjacent elements with the filter and add the result |
117 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); | 117 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); |
118 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); | 118 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); |
119 | 119 |
120 // add and saturate the results together | 120 // add and saturate the results together |
121 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); | 121 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); |
122 | 122 |
123 // filter the source buffer | 123 // filter the source buffer |
124 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); | 124 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); |
125 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); | 125 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); |
126 | 126 |
127 // multiply 2 adjacent elements with the filter and add the result | 127 // multiply 2 adjacent elements with the filter and add the result |
128 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); | 128 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); |
129 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); | 129 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); |
130 | 130 |
131 // add and saturate the results together | 131 // add and saturate the results together |
132 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, | 132 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, |
133 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); | 133 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); |
134 | 134 |
135 // reading 2 strides of the next 16 bytes | 135 // reading 2 strides of the next 16 bytes |
136 // (part of it was being read by earlier read) | 136 // (part of it was being read by earlier read) |
137 srcReg32b2 = _mm256_castsi128_si256( | 137 srcReg32b2 = _mm256_castsi128_si256( |
138 _mm_loadu_si128((__m128i *)(src_ptr+5))); | 138 _mm_loadu_si128((__m128i *)(src_ptr+5))); |
139 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, | 139 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, |
140 _mm_loadu_si128((__m128i *) | 140 _mm_loadu_si128((__m128i *) |
141 (src_ptr+src_pixels_per_line+5)), 1); | 141 (src_ptr+src_pixels_per_line+5)), 1); |
142 | 142 |
143 // add and saturate the results together | 143 // add and saturate the results together |
144 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, | 144 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, |
145 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); | 145 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); |
146 | 146 |
147 // filter the source buffer | 147 // filter the source buffer |
148 srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); | 148 srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); |
149 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); | 149 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); |
150 | 150 |
151 // multiply 2 adjacent elements with the filter and add the result | 151 // multiply 2 adjacent elements with the filter and add the result |
152 srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); | 152 srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); |
153 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); | 153 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); |
154 | 154 |
155 // add and saturate the results together | 155 // add and saturate the results together |
156 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); | 156 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); |
157 | 157 |
158 // filter the source buffer | 158 // filter the source buffer |
159 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg); | 159 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); |
160 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); | 160 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); |
161 | 161 |
162 // multiply 2 adjacent elements with the filter and add the result | 162 // multiply 2 adjacent elements with the filter and add the result |
163 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); | 163 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); |
164 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); | 164 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); |
165 | 165 |
166 // add and saturate the results together | 166 // add and saturate the results together |
167 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, | 167 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, |
168 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); | 168 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); |
169 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, | 169 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, |
170 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); | 170 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); |
171 | 171 |
172 | 172 |
173 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); | 173 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); |
(...skipping 27 matching lines...) Expand all Loading... |
201 if (i > 0) { | 201 if (i > 0) { |
202 __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; | 202 __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; |
203 __m128i srcRegFilt2, srcRegFilt3; | 203 __m128i srcRegFilt2, srcRegFilt3; |
204 | 204 |
205 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); | 205 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); |
206 | 206 |
207 // filter the source buffer | 207 // filter the source buffer |
208 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, | 208 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, |
209 _mm256_castsi256_si128(filt1Reg)); | 209 _mm256_castsi256_si128(filt1Reg)); |
210 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, | 210 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, |
211 _mm256_castsi256_si128(filt2Reg)); | 211 _mm256_castsi256_si128(filt4Reg)); |
212 | 212 |
213 // multiply 2 adjacent elements with the filter and add the result | 213 // multiply 2 adjacent elements with the filter and add the result |
214 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, | 214 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, |
215 _mm256_castsi256_si128(firstFilters)); | 215 _mm256_castsi256_si128(firstFilters)); |
216 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, | 216 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, |
217 _mm256_castsi256_si128(secondFilters)); | 217 _mm256_castsi256_si128(forthFilters)); |
218 | 218 |
219 // add and saturate the results together | 219 // add and saturate the results together |
220 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); | 220 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); |
221 | 221 |
222 // filter the source buffer | 222 // filter the source buffer |
223 srcRegFilt3= _mm_shuffle_epi8(srcReg1, | 223 srcRegFilt3= _mm_shuffle_epi8(srcReg1, |
224 _mm256_castsi256_si128(filt4Reg)); | 224 _mm256_castsi256_si128(filt2Reg)); |
225 srcRegFilt2= _mm_shuffle_epi8(srcReg1, | 225 srcRegFilt2= _mm_shuffle_epi8(srcReg1, |
226 _mm256_castsi256_si128(filt3Reg)); | 226 _mm256_castsi256_si128(filt3Reg)); |
227 | 227 |
228 // multiply 2 adjacent elements with the filter and add the result | 228 // multiply 2 adjacent elements with the filter and add the result |
229 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, | 229 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, |
230 _mm256_castsi256_si128(forthFilters)); | 230 _mm256_castsi256_si128(secondFilters)); |
231 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, | 231 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, |
232 _mm256_castsi256_si128(thirdFilters)); | 232 _mm256_castsi256_si128(thirdFilters)); |
233 | 233 |
234 // add and saturate the results together | 234 // add and saturate the results together |
235 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, | 235 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
236 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); | 236 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); |
237 | 237 |
238 // reading the next 16 bytes | 238 // reading the next 16 bytes |
239 // (part of it was being read by earlier read) | 239 // (part of it was being read by earlier read) |
240 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); | 240 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); |
241 | 241 |
242 // add and saturate the results together | 242 // add and saturate the results together |
243 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, | 243 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
244 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); | 244 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); |
245 | 245 |
246 // filter the source buffer | 246 // filter the source buffer |
247 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, | 247 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, |
248 _mm256_castsi256_si128(filt1Reg)); | 248 _mm256_castsi256_si128(filt1Reg)); |
249 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, | 249 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, |
250 _mm256_castsi256_si128(filt2Reg)); | 250 _mm256_castsi256_si128(filt4Reg)); |
251 | 251 |
252 // multiply 2 adjacent elements with the filter and add the result | 252 // multiply 2 adjacent elements with the filter and add the result |
253 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, | 253 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, |
254 _mm256_castsi256_si128(firstFilters)); | 254 _mm256_castsi256_si128(firstFilters)); |
255 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, | 255 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, |
256 _mm256_castsi256_si128(secondFilters)); | 256 _mm256_castsi256_si128(forthFilters)); |
257 | 257 |
258 // add and saturate the results together | 258 // add and saturate the results together |
259 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); | 259 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); |
260 | 260 |
261 // filter the source buffer | 261 // filter the source buffer |
262 srcRegFilt3 = _mm_shuffle_epi8(srcReg2, | 262 srcRegFilt3 = _mm_shuffle_epi8(srcReg2, |
263 _mm256_castsi256_si128(filt4Reg)); | 263 _mm256_castsi256_si128(filt2Reg)); |
264 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, | 264 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, |
265 _mm256_castsi256_si128(filt3Reg)); | 265 _mm256_castsi256_si128(filt3Reg)); |
266 | 266 |
267 // multiply 2 adjacent elements with the filter and add the result | 267 // multiply 2 adjacent elements with the filter and add the result |
268 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, | 268 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, |
269 _mm256_castsi256_si128(forthFilters)); | 269 _mm256_castsi256_si128(secondFilters)); |
270 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, | 270 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, |
271 _mm256_castsi256_si128(thirdFilters)); | 271 _mm256_castsi256_si128(thirdFilters)); |
272 | 272 |
273 // add and saturate the results together | 273 // add and saturate the results together |
274 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, | 274 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, |
275 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); | 275 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); |
276 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, | 276 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, |
277 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); | 277 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); |
278 | 278 |
279 | 279 |
(...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
535 | 535 |
536 // shrink to 8 bit each 16 bits, the first lane contain the first | 536 // shrink to 8 bit each 16 bits, the first lane contain the first |
537 // convolve result and the second lane contain the second convolve | 537 // convolve result and the second lane contain the second convolve |
538 // result | 538 // result |
539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); | 539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); |
540 | 540 |
541 // save 16 bytes | 541 // save 16 bytes |
542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); | 542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); |
543 } | 543 } |
544 } | 544 } |
OLD | NEW |