| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 104 for (i = output_height; i > 1; i-=2) { | 104 for (i = output_height; i > 1; i-=2) { |
| 105 // load the 2 strides of source | 105 // load the 2 strides of source |
| 106 srcReg32b1 = _mm256_castsi128_si256( | 106 srcReg32b1 = _mm256_castsi128_si256( |
| 107 _mm_loadu_si128((__m128i *)(src_ptr-3))); | 107 _mm_loadu_si128((__m128i *)(src_ptr-3))); |
| 108 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, | 108 srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, |
| 109 _mm_loadu_si128((__m128i *) | 109 _mm_loadu_si128((__m128i *) |
| 110 (src_ptr+src_pixels_per_line-3)), 1); | 110 (src_ptr+src_pixels_per_line-3)), 1); |
| 111 | 111 |
| 112 // filter the source buffer | 112 // filter the source buffer |
| 113 srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); | 113 srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); |
| 114 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); | 114 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); |
| 115 | 115 |
| 116 // multiply 2 adjacent elements with the filter and add the result | 116 // multiply 2 adjacent elements with the filter and add the result |
| 117 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); | 117 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); |
| 118 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); | 118 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); |
| 119 | 119 |
| 120 // add and saturate the results together | 120 // add and saturate the results together |
| 121 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); | 121 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); |
| 122 | 122 |
| 123 // filter the source buffer | 123 // filter the source buffer |
| 124 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); | 124 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); |
| 125 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); | 125 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); |
| 126 | 126 |
| 127 // multiply 2 adjacent elements with the filter and add the result | 127 // multiply 2 adjacent elements with the filter and add the result |
| 128 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); | 128 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); |
| 129 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); | 129 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); |
| 130 | 130 |
| 131 // add and saturate the results together | 131 // add and saturate the results together |
| 132 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, | 132 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, |
| 133 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); | 133 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); |
| 134 | 134 |
| 135 // reading 2 strides of the next 16 bytes | 135 // reading 2 strides of the next 16 bytes |
| 136 // (part of it was being read by earlier read) | 136 // (part of it was being read by earlier read) |
| 137 srcReg32b2 = _mm256_castsi128_si256( | 137 srcReg32b2 = _mm256_castsi128_si256( |
| 138 _mm_loadu_si128((__m128i *)(src_ptr+5))); | 138 _mm_loadu_si128((__m128i *)(src_ptr+5))); |
| 139 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, | 139 srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, |
| 140 _mm_loadu_si128((__m128i *) | 140 _mm_loadu_si128((__m128i *) |
| 141 (src_ptr+src_pixels_per_line+5)), 1); | 141 (src_ptr+src_pixels_per_line+5)), 1); |
| 142 | 142 |
| 143 // add and saturate the results together | 143 // add and saturate the results together |
| 144 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, | 144 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, |
| 145 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); | 145 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); |
| 146 | 146 |
| 147 // filter the source buffer | 147 // filter the source buffer |
| 148 srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); | 148 srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); |
| 149 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); | 149 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); |
| 150 | 150 |
| 151 // multiply 2 adjacent elements with the filter and add the result | 151 // multiply 2 adjacent elements with the filter and add the result |
| 152 srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); | 152 srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); |
| 153 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); | 153 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); |
| 154 | 154 |
| 155 // add and saturate the results together | 155 // add and saturate the results together |
| 156 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); | 156 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); |
| 157 | 157 |
| 158 // filter the source buffer | 158 // filter the source buffer |
| 159 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg); | 159 srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); |
| 160 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); | 160 srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); |
| 161 | 161 |
| 162 // multiply 2 adjacent elements with the filter and add the result | 162 // multiply 2 adjacent elements with the filter and add the result |
| 163 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); | 163 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); |
| 164 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); | 164 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); |
| 165 | 165 |
| 166 // add and saturate the results together | 166 // add and saturate the results together |
| 167 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, | 167 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, |
| 168 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); | 168 _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); |
| 169 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, | 169 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, |
| 170 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); | 170 _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); |
| 171 | 171 |
| 172 | 172 |
| 173 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); | 173 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); |
| (...skipping 27 matching lines...) Expand all Loading... |
| 201 if (i > 0) { | 201 if (i > 0) { |
| 202 __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; | 202 __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; |
| 203 __m128i srcRegFilt2, srcRegFilt3; | 203 __m128i srcRegFilt2, srcRegFilt3; |
| 204 | 204 |
| 205 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); | 205 srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); |
| 206 | 206 |
| 207 // filter the source buffer | 207 // filter the source buffer |
| 208 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, | 208 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, |
| 209 _mm256_castsi256_si128(filt1Reg)); | 209 _mm256_castsi256_si128(filt1Reg)); |
| 210 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, | 210 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, |
| 211 _mm256_castsi256_si128(filt2Reg)); | 211 _mm256_castsi256_si128(filt4Reg)); |
| 212 | 212 |
| 213 // multiply 2 adjacent elements with the filter and add the result | 213 // multiply 2 adjacent elements with the filter and add the result |
| 214 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, | 214 srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, |
| 215 _mm256_castsi256_si128(firstFilters)); | 215 _mm256_castsi256_si128(firstFilters)); |
| 216 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, | 216 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, |
| 217 _mm256_castsi256_si128(secondFilters)); | 217 _mm256_castsi256_si128(forthFilters)); |
| 218 | 218 |
| 219 // add and saturate the results together | 219 // add and saturate the results together |
| 220 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); | 220 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); |
| 221 | 221 |
| 222 // filter the source buffer | 222 // filter the source buffer |
| 223 srcRegFilt3= _mm_shuffle_epi8(srcReg1, | 223 srcRegFilt3= _mm_shuffle_epi8(srcReg1, |
| 224 _mm256_castsi256_si128(filt4Reg)); | 224 _mm256_castsi256_si128(filt2Reg)); |
| 225 srcRegFilt2= _mm_shuffle_epi8(srcReg1, | 225 srcRegFilt2= _mm_shuffle_epi8(srcReg1, |
| 226 _mm256_castsi256_si128(filt3Reg)); | 226 _mm256_castsi256_si128(filt3Reg)); |
| 227 | 227 |
| 228 // multiply 2 adjacent elements with the filter and add the result | 228 // multiply 2 adjacent elements with the filter and add the result |
| 229 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, | 229 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, |
| 230 _mm256_castsi256_si128(forthFilters)); | 230 _mm256_castsi256_si128(secondFilters)); |
| 231 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, | 231 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, |
| 232 _mm256_castsi256_si128(thirdFilters)); | 232 _mm256_castsi256_si128(thirdFilters)); |
| 233 | 233 |
| 234 // add and saturate the results together | 234 // add and saturate the results together |
| 235 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, | 235 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
| 236 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); | 236 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); |
| 237 | 237 |
| 238 // reading the next 16 bytes | 238 // reading the next 16 bytes |
| 239 // (part of it was being read by earlier read) | 239 // (part of it was being read by earlier read) |
| 240 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); | 240 srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); |
| 241 | 241 |
| 242 // add and saturate the results together | 242 // add and saturate the results together |
| 243 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, | 243 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, |
| 244 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); | 244 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); |
| 245 | 245 |
| 246 // filter the source buffer | 246 // filter the source buffer |
| 247 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, | 247 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, |
| 248 _mm256_castsi256_si128(filt1Reg)); | 248 _mm256_castsi256_si128(filt1Reg)); |
| 249 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, | 249 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, |
| 250 _mm256_castsi256_si128(filt2Reg)); | 250 _mm256_castsi256_si128(filt4Reg)); |
| 251 | 251 |
| 252 // multiply 2 adjacent elements with the filter and add the result | 252 // multiply 2 adjacent elements with the filter and add the result |
| 253 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, | 253 srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, |
| 254 _mm256_castsi256_si128(firstFilters)); | 254 _mm256_castsi256_si128(firstFilters)); |
| 255 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, | 255 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, |
| 256 _mm256_castsi256_si128(secondFilters)); | 256 _mm256_castsi256_si128(forthFilters)); |
| 257 | 257 |
| 258 // add and saturate the results together | 258 // add and saturate the results together |
| 259 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); | 259 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); |
| 260 | 260 |
| 261 // filter the source buffer | 261 // filter the source buffer |
| 262 srcRegFilt3 = _mm_shuffle_epi8(srcReg2, | 262 srcRegFilt3 = _mm_shuffle_epi8(srcReg2, |
| 263 _mm256_castsi256_si128(filt4Reg)); | 263 _mm256_castsi256_si128(filt2Reg)); |
| 264 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, | 264 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, |
| 265 _mm256_castsi256_si128(filt3Reg)); | 265 _mm256_castsi256_si128(filt3Reg)); |
| 266 | 266 |
| 267 // multiply 2 adjacent elements with the filter and add the result | 267 // multiply 2 adjacent elements with the filter and add the result |
| 268 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, | 268 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, |
| 269 _mm256_castsi256_si128(forthFilters)); | 269 _mm256_castsi256_si128(secondFilters)); |
| 270 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, | 270 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, |
| 271 _mm256_castsi256_si128(thirdFilters)); | 271 _mm256_castsi256_si128(thirdFilters)); |
| 272 | 272 |
| 273 // add and saturate the results together | 273 // add and saturate the results together |
| 274 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, | 274 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, |
| 275 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); | 275 _mm_min_epi16(srcRegFilt3, srcRegFilt2)); |
| 276 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, | 276 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, |
| 277 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); | 277 _mm_max_epi16(srcRegFilt3, srcRegFilt2)); |
| 278 | 278 |
| 279 | 279 |
| (...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 535 | 535 |
| 536 // shrink to 8 bit each 16 bits, the first lane contain the first | 536 // shrink to 8 bit each 16 bits, the first lane contain the first |
| 537 // convolve result and the second lane contain the second convolve | 537 // convolve result and the second lane contain the second convolve |
| 538 // result | 538 // result |
| 539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); | 539 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); |
| 540 | 540 |
| 541 // save 16 bytes | 541 // save 16 bytes |
| 542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); | 542 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); |
| 543 } | 543 } |
| 544 } | 544 } |
| OLD | NEW |