/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>  // AVX2
#include "vpx_ports/mem.h"
#include "vp9/encoder/vp9_variance.h"

DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15,
  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15
};
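// Each 32-byte row of the table above holds one bilinear filter pair
// (16 - k, k), k = 0..15, broadcast across a full 256-bit register. A row is
// selected as bilinear_filters_avx2 + (offset << 5), which matches the
// "offset <<= 5" scaling applied before the filter loads below.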

#define FILTER_SRC(filter) \
  /* filter the source */ \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
  \
  /* add 8 to source */ \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
  \
  /* divide source by 16 */ \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
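// For each byte pair (a, b) interleaved by the MERGE_* macros, FILTER_SRC
// computes the rounded bilinear value (a * (16 - k) + b * k + 8) >> 4 as a
// 16-bit result.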

#define MERGE_WITH_SRC(src_reg, reg) \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

#define LOAD_SRC_DST \
  /* load source and destination */ \
  src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
  dst_reg = _mm256_load_si256((__m256i const *) (dst));
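// Note: src is read with an unaligned load, while dst (and sec in the avg
// variant below) use aligned 256-bit loads, so those buffers are expected to
// be 32-byte aligned.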

#define AVG_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                    (src + size_stride)); \
  /* average between current and next stride source */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                    (src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)

#define CALC_SUM_SSE_INSIDE_LOOP \
  /* expand each byte to 2 bytes */ \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
  /* source - dest */ \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
  /* calculate sum */ \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */ \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
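// sum_reg accumulates the signed (src - dst) differences in 16-bit lanes and
// sse_reg accumulates the squared differences in 32-bit lanes. With one
// difference per lane per row and the block heights used by the callers of
// this helper (at most 64), each 16-bit lane stays within int16 range
// (64 * 255 = 16320).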

// final calculation of sum and sse
#define CALC_SUM_AND_SSE \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
  \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
  \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  *((int*)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
                 _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
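// CALC_SUM_AND_SSE sign-extends the 16-bit partial sums to 32 bits (the
// zero > sum comparison mask supplies the high halves), then reduces sum and
// sse within each 128-bit lane and finally adds the two lanes together to
// produce the scalar results.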


unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset,
                                             int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
  // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit values back to 8 bits within each 128-bit lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous and current packed rows
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                                  (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                                  (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit values back to 8 bits within each 128-bit lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}
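
// A minimal sketch (illustrative only, not part of this file) of how a
// fixed-size wrapper might use the helper above: run the 32-wide kernel over
// 32 rows, then turn the accumulated sum and SSE into a variance. The wrapper
// name is hypothetical.
#if 0
unsigned int example_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset, int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sse) {
  // sum of (src - dst); *sse receives the sum of squared differences
  const int sum = (int)vp9_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
  // variance = SSE - sum^2 / N, with N = 32 * 32 = 1024 (>> 10)
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}
#endif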

unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                                 int src_stride,
                                                 int x_offset,
                                                 int y_offset,
                                                 const uint8_t *dst,
                                                 int dst_stride,
                                                 const uint8_t *sec,
                                                 int sec_stride,
                                                 int height,
                                                 unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
  // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
                                 (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit values back to 8 bits within each 128-bit lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous and current packed rows
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                                  (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                                  (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load source and a second source starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit values back to 8 bits within each 128-bit lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}
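
// Note: sec (the second prediction) is read with aligned 256-bit loads of 32
// bytes per row, so callers are expected to pass a 32-byte-aligned buffer
// whose stride keeps every row 32-byte aligned (in practice a 32-byte-wide
// buffer for these 32-pixel-wide blocks).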