/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>

#include "vpx/vpx_integer.h"  // int16_t
#include "vpx_ports/mem.h"    // DECLARE_ALIGNED

// shuffle masks used by the 16_h8 horizontal filter below: each gathers a
// sliding window of adjacent source-byte pairs, duplicated in both lanes
DECLARE_ALIGNED(32, const unsigned char, filt1_global_avx2[32]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8};

DECLARE_ALIGNED(32, const unsigned char, filt2_global_avx2[32]) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10};

DECLARE_ALIGNED(32, const unsigned char, filt3_global_avx2[32]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12};

DECLARE_ALIGNED(32, const unsigned char, filt4_global_avx2[32]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14};
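
/* Per output pixel x, the horizontal filter below computes (a scalar sketch;
 * clip_to_0_255() is illustrative shorthand, not a helper defined here):
 *
 *   sum = 0;
 *   for (k = 0; k < 8; ++k)
 *     sum += src_ptr[x - 3 + k] * filter[k];
 *   output_ptr[x] = clip_to_0_255((sum + 64) >> 7);
 *
 * The VP9 8-tap filters are Q7 fixed point (the taps sum to 128), so adding
 * 64 before the shift rounds to nearest. The masks above gather the byte
 * pairs (x-3, x-2), (x-1, x), (x+1, x+2) and (x+3, x+4) for 8 consecutive
 * pixels at a time so that maddubs can multiply and pair-add in one step.
 * The vertical filter applies the same formula down a column of rows.
 */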

void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned char *output_ptr,
                                  unsigned int output_pitch,
                                  unsigned int output_height,
                                  int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  unsigned int src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x00400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16-bit (short) filter taps to 8 bit (byte) and have the
  // same data in both lanes of the 128-bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256-bit register
#if defined(__GNUC__)
#if (__GNUC__ < 4 || (__GNUC__ == 4 && \
     (__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0))))
  filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg);
#elif (__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0))
  filtersReg32 = _mm_broadcastsi128_si256(filtersReg);
#else
  filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
#endif
#else
  filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
#endif
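  // (the #if chain above accommodates GCC releases that expose the
  // 128-to-256-bit broadcast under the older _mm_broadcastsi128_si256
  // spelling, in one case taking a pointer argument)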

  // duplicate only the first 16 bits (first and second byte)
  // across the 256-bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across the 256-bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32,
                  _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 256-bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across the 256-bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x706u));
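
  // each 16-bit lane of firstFilters now holds the signed byte pair (f0, f1);
  // combined with the (s[x], s[x+1]) byte pairs gathered by filt1Reg below,
  // one _mm256_maddubs_epi16 produces f0*s[x] + f1*s[x+1] for each pixel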

  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  // multiply the size of the source and destination strides by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
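
  // each iteration of the loop below filters two rows at once, one row per
  // 128-bit lane of the 256-bit registers, and stores two 16-byte output rows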
  for (i = output_height; i > 1; i -= 2) {
    // load 16 bytes from each of 2 rows of the source
    srcReg32b1 = _mm256_castsi128_si256(
                 _mm_loadu_si128((__m128i *)(src_ptr - 3)));
    srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
                 _mm_loadu_si128((__m128i *)
                 (src_ptr + src_pixels_per_line - 3)), 1);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // load the next 16 bytes of each of the 2 rows
    srcReg32b2 = _mm256_castsi128_si256(
                 _mm_loadu_si128((__m128i *)(src_ptr + 5)));
    srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
                 _mm_loadu_si128((__m128i *)
                 (src_ptr + src_pixels_per_line + 5)), 1);
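
    // the +5 offset: output pixels 8..15 read source bytes x-3..x+4, i.e.
    // bytes 5..19 of the row, so this 16-byte load overlaps the first one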

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
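
    // note: splitting the 3rd/4th tap sums with min/max and adding the
    // smaller one first keeps the running 16-bit total lower, so the
    // saturating adds are less likely to clip an intermediate value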

    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

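    // add the Q7 rounding constant: the 8 taps sum to 128, so
    // (sum + 64) >> 7 below rounds the filtered value to nearest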
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);

    // shift each 16-bit value right by 7 bits
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);

    // shrink each 16-bit value to 8 bits with unsigned saturation; the first
    // lane contains the first row's result and the second lane the second
    // row's result
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
                                           srcRegFilt32b2_1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcRegFilt32b1_1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + output_pitch),
                    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
    output_ptr += dst_stride;
  }

  // if output_height is odd, process the last row by itself
  // using 128-bit registers
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
                    _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt2Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(secondFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt4Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(forthFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // load the next 16 bytes (they overlap the earlier load)
    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
                    _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt2Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(secondFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt4Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(forthFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together, min first then max as above
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // add the rounding constant (64) and shift right by 7 bits
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink to 8 bits with unsigned saturation: the low half holds
    // pixels 0..7 and the high half pixels 8..15 of the row
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
  }
}

void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
                                  unsigned int src_pitch,
                                  unsigned char *output_ptr,
                                  unsigned int out_pitch,
                                  unsigned int output_height,
                                  int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  unsigned int src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x00400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16-bit (short) filter taps to 8 bit (byte) and have the
  // same data in both lanes of the 128-bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256-bit register
  // (same compiler-specific broadcast workaround as in the h8 filter above)
#if defined(__GNUC__)
#if (__GNUC__ < 4 || (__GNUC__ == 4 && \
     (__GNUC_MINOR__ < 6 || (__GNUC_MINOR__ == 6 && __GNUC_PATCHLEVEL__ > 0))))
  filtersReg32 = _mm_broadcastsi128_si256((__m128i const *)&filtersReg);
#elif (__GNUC__ == 4 && (__GNUC_MINOR__ == 7 && __GNUC_PATCHLEVEL__ > 0))
  filtersReg32 = _mm_broadcastsi128_si256(filtersReg);
#else
  filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
#endif
#else
  filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
#endif

  // duplicate only the first 16 bits (first and second byte)
  // across the 256-bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across the 256-bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32,
                  _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 256-bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across the 256-bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x706u));

  // multiply the size of the source and destination strides by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 16 bytes from each of the first 7 rows, one src_pitch apart
  srcReg32b1 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr)));
  srcReg32b2 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr + src_pitch)));
  srcReg32b3 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2)));
  srcReg32b4 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3)));
  srcReg32b5 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4)));
  srcReg32b6 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5)));
  srcReg32b7 = _mm256_castsi128_si256(
               _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6)));

  // place each pair of consecutive rows in the same 256-bit register
  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
               _mm256_castsi256_si128(srcReg32b2), 1);
  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
               _mm256_castsi256_si128(srcReg32b3), 1);
  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
               _mm256_castsi256_si128(srcReg32b4), 1);
  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
               _mm256_castsi256_si128(srcReg32b5), 1);
  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
               _mm256_castsi256_si128(srcReg32b6), 1);
  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
               _mm256_castsi256_si128(srcReg32b7), 1);

  // interleave every two consecutive registers except the last one;
  // the interleaved registers are saved and reused across loop iterations
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
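
  // after the unpacks, each 16-bit lane holds one pixel column's vertical
  // byte pair (row k, row k+1), so maddubs with a duplicated tap pair
  // yields f[k]*p[k] + f[k+1]*p[k+1] for 8 columns per lane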

  for (i = output_height; i > 1; i -= 2) {
    // load 2 new rows and pair each with the row above it,
    // two consecutive rows per 256-bit register
    srcReg32b8 = _mm256_castsi128_si256(
                 _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
                 _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
                 _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
                 _mm256_castsi256_si128(srcReg32b9), 1);

    // interleave the two new row pairs (saved for reuse below)
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
    srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

    // add and saturate the results together, min first then max,
    // to limit intermediate saturation
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                  _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                 _mm256_min_epi16(srcReg32b6, srcReg32b13));
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                  _mm256_max_epi16(srcReg32b8, srcReg32b12));
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                 _mm256_max_epi16(srcReg32b6, srcReg32b13));

    // add the Q7 rounding constant (64) and shift right by 7 bits
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);

    // shrink each 16-bit value to 8 bits with unsigned saturation; the first
    // lane contains the first row's result and the second lane the second
    // row's result
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcReg32b1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + out_pitch),
                    _mm256_extractf128_si256(srcReg32b1, 1));

    output_ptr += dst_stride;

    // slide the window down two rows: reuse the interleaved registers
    // in the next iteration
    srcReg32b10 = srcReg32b11;
    srcReg32b1 = srcReg32b3;
    srcReg32b11 = srcReg32b2;
    srcReg32b3 = srcReg32b5;
    srcReg32b2 = srcReg32b4;
    srcReg32b5 = srcReg32b7;
    srcReg32b7 = srcReg32b9;
  }
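
  // if output_height is odd, filter the one remaining row using 128-bit
  // registers, as in the tail of the horizontal filter above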
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    // load 16 bytes of the last row
    srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));

    // interleave the last 2 rows together
    srcRegFilt4 = _mm_unpacklo_epi8(
                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    srcRegFilt7 = _mm_unpackhi_epi8(
                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                  _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
                  _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                  _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                  _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together, min first then max,
    // to limit intermediate saturation
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm_min_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm_min_epi16(srcRegFilt5, srcRegFilt7));
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm_max_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm_max_epi16(srcRegFilt5, srcRegFilt7));

    // add the Q7 rounding constant (64) and shift right by 7 bits
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);

    // shrink to 8 bits with unsigned saturation: the low half holds
    // pixels 0..7 and the high half pixels 8..15 of the row
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
  }
}