/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>  // AVX2
#include "vpx_ports/mem.h"
#include "vp9/encoder/vp9_variance.h"

DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15,
  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15
};
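// Each 32-byte row of the table above holds one bilinear filter pair
// (16 - k, k), k = 0..15, broadcast across a full 256-bit register. A row is
// selected as bilinear_filters_avx2 + (offset << 5), which matches the
// "offset <<= 5" scaling applied before the filter loads below.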

#define FILTER_SRC(filter) \
  /* filter the source */ \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
  \
  /* add 8 to source */ \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
  \
  /* divide source by 16 */ \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
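// For each byte pair (a, b) interleaved by the MERGE_* macros, FILTER_SRC
// computes the rounded bilinear value (a * (16 - k) + b * k + 8) >> 4 as a
// 16-bit result.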

#define MERGE_WITH_SRC(src_reg, reg) \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

#define LOAD_SRC_DST \
  /* load source and destination */ \
  src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
  dst_reg = _mm256_load_si256((__m256i const *) (dst));
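// Note: src is read with an unaligned load, while dst (and sec in the avg
// variant below) use aligned 256-bit loads, so those buffers are expected to
// be 32-byte aligned.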

#define AVG_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                    (src + size_stride)); \
  /* average between current and next stride source */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                    (src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)

#define CALC_SUM_SSE_INSIDE_LOOP \
  /* expand each byte to 2 bytes */ \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
  /* source - dest */ \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
  /* calculate sum */ \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */ \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
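// sum_reg accumulates the signed (src - dst) differences in 16-bit lanes and
// sse_reg accumulates the squared differences in 32-bit lanes. With one
// difference per lane per row and the block heights used by the callers of
// this helper (at most 64), each 16-bit lane stays within int16 range
// (64 * 255 = 16320).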

// final calculation of sum and sse
#define CALC_SUM_AND_SSE \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
  \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
  \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  *((int*)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
                 _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
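// CALC_SUM_AND_SSE sign-extends the 16-bit partial sums to 32 bits (the
// zero > sum comparison mask supplies the high halves), then reduces sum and
// sse within each 128-bit lane and finally adds the two lanes together to
// produce the scalar results.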


unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset,
                                             int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
  // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit values back to 8 bits within each 128-bit lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous and current packed rows
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                                  (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                                  (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit values back to 8 bits within each 128-bit lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}
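
// A minimal sketch (illustrative only, not part of this file) of how a
// fixed-size wrapper might use the helper above: run the 32-wide kernel over
// 32 rows, then turn the accumulated sum and SSE into a variance. The wrapper
// name is hypothetical.
#if 0
unsigned int example_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset, int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sse) {
  // sum of (src - dst); *sse receives the sum of squared differences
  const int sum = (int)vp9_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
  // variance = SSE - sum^2 / N, with N = 32 * 32 = 1024 (>> 10)
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}
#endif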

unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                                 int src_stride,
                                                 int x_offset,
                                                 int y_offset,
                                                 const uint8_t *dst,
                                                 int dst_stride,
                                                 const uint8_t *sec,
                                                 int sec_stride,
                                                 int height,
                                                 unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
  // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit values back to 8 bits within each 128-bit lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous and current packed rows
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                                  (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                                  (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit values back to 8 bits within each 128-bit lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}
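
// Note: sec (the second prediction) is read with aligned 256-bit loads of 32
// bytes per row, so callers are expected to pass a 32-byte-aligned buffer
// whose stride keeps every row 32-byte aligned (in practice a 32-byte-wide
// buffer for these 32-pixel-wide blocks).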