simd/jdsample-altivec.c - Issue 1934113002: Update libjpeg_turbo to 1.4.90 from https://github.com/libjpeg-turbo/

Side by Side Diff: simd/jdsample-altivec.c

Issue 1934113002: Update libjpeg_turbo to 1.4.90 from https://github.com/libjpeg-turbo/ (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 * AltiVec optimizations for libjpeg-turbo

	3 *

	4 * Copyright (C) 2015, D. R. Commander.

	5 * All rights reserved.

	6 * This software is provided 'as-is', without any express or implied

	7 * warranty. In no event will the authors be held liable for any damages

	8 * arising from the use of this software.

	9 *

	10 * Permission is granted to anyone to use this software for any purpose,

	11 * including commercial applications, and to alter it and redistribute it

	12 * freely, subject to the following restrictions:

	13 *

	14 * 1. The origin of this software must not be misrepresented; you must not

	15 * claim that you wrote the original software. If you use this software

	16 * in a product, an acknowledgment in the product documentation would be

	17 * appreciated but is not required.

	18 * 2. Altered source versions must be plainly marked as such, and must not be

	19 * misrepresented as being the original software.

	20 * 3. This notice may not be removed or altered from any source distribution.

	21 */

	22

	23 /* CHROMA UPSAMPLING */

	24

	25 #include "jsimd_altivec.h"

	26

	27

	/*
	 * 2:1 horizontal "fancy" upsampling of chroma rows, 16 input samples
	 * (32 output samples) per inner-loop iteration.
	 *
	 * For each input sample s with left neighbour l and right neighbour r the
	 * code below produces two output samples:
	 *   even output = (3 * s + l + 1) >> 2
	 *   odd output  = (3 * s + r + 2) >> 2
	 * The first sample of a row is used as its own left neighbour and the last
	 * sample as its own right neighbour.
	 *
	 * max_v_samp_factor  number of rows to process (input and output rows are
	 *                    indexed with the same row counter)
	 * downsampled_width  number of input samples per row
	 * input_data         array of input row pointers
	 * output_data_ptr    pointer to the array of output row pointers
	 *
	 * Side effect: when downsampled_width is not a multiple of 16, the sample
	 * at inptr[downsampled_width] of each input row is overwritten with a copy
	 * of the last real sample.
	 *
	 * NOTE(review): rows are read and written in whole 16-byte vectors with
	 * vec_ld/vec_st, so the row buffers are presumably 16-byte aligned and
	 * padded by the caller -- confirm against the allocator.
	 */
	28 void

	29 jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,

	30 JDIMENSION downsampled_width,

	31 JSAMPARRAY input_data,

	32 JSAMPARRAY *output_data_ptr)

	33 {

	34 JSAMPARRAY output_data = *output_data_ptr;

	35 JSAMPROW inptr, outptr;

	36 int inrow, incol;

	37

	/* this0 = current 16 samples, last0 = previous 16, next0 = following 16;
	 * p_last0 / p_next0 = this0 shifted by one sample so that every lane holds
	 * its left / right neighbour.  next0 is only loaded when another block
	 * follows; the {0} initializer gives it a defined value for the copy at the
	 * bottom of the final iteration. */
	38 __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0,

	39 out;

	40 __vector short this0e, this0o, this0l, this0h, last0l, last0h,

	41 next0l, next0h, outle, outhe, outlo, outho;

	42

	43 /* Constants */

	/* vec_perm selectors (indices 0-15 pick from the first operand, 16-31 from
	 * the second):
	 *   last_index_col0     shift right by one sample, duplicating sample 0
	 *   last_index          sample 15 of the first operand followed by
	 *                       samples 0-14 of the second
	 *   next_index          samples 1-15 of the first operand followed by
	 *                       sample 0 of the second
	 *   next_index_lastcol  shift left by one sample, duplicating sample 15
	 *   merge_pack_index    alternately takes one byte of each 16-bit element
	 *                       from the two operands (byte 1 on big-endian, byte 0
	 *                       on little-endian, i.e. the low-order byte), which
	 *                       narrows the results to bytes and interleaves them
	 * __16X / __8X come from jsimd_altivec.h (not visible here); presumably they
	 * repeat their argument 16 / 8 times -- confirm. */
	44 __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },

	45 last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14},

	46 last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},

	47 next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},

	48 next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},

	49 #if __BIG_ENDIAN__

	50 merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};

	51 #else

	52 merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};

	53 #endif

	/* pw_one / pw_two are the rounding terms; pw_two is also reused (cast to
	 * unsigned) as the shift count for the final >> 2. */
	54 __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };

	55

	56 for (inrow = 0; inrow < max_v_samp_factor; inrow++) {

	57 inptr = input_data[inrow];

	58 outptr = output_data[inrow];

	59

	/* Replicate the last real sample one position past the end of the row so
	 * that it serves as the right neighbour inside a partially filled block. */
	60 if (downsampled_width & 15)

	61 inptr[downsampled_width] = inptr[downsampled_width - 1];

	62

	/* Prime the pipeline for the first block: sample 0 is its own left
	 * neighbour. */
	63 this0 = vec_ld(0, inptr);

	64 p_last0 = vec_perm(this0, this0, last_index_col0);

	65 last0 = this0;

	66

	/* incol counts the input samples still to be processed in this row. */
	67 for (incol = downsampled_width; incol > 0;

	68 incol -= 16, inptr += 16, outptr += 32) {

	69

	/* True on every iteration except the first: take the left neighbour of
	 * lane 0 from the previous block. */
	70 if (downsampled_width - incol > 0) {

	71 p_last0 = vec_perm(last0, this0, last_index);

	72 last0 = this0;

	73 }

	74

	/* Final block: duplicate the last lane instead of loading past the row;
	 * otherwise fetch the next block and borrow its first sample. */
	75 if (incol <= 16)

	76 p_next0 = vec_perm(this0, this0, next_index_lastcol);

	77 else {

	78 next0 = vec_ld(16, inptr);

	79 p_next0 = vec_perm(this0, next0, next_index);

	80 }

	81

	/* 3 * this0 as 16-bit values: the even/odd multiplies each yield eight
	 * products, and the merges put them back in sample order (this0l feeds the
	 * first store below, this0h the second). */
	82 this0e = (__vector short)vec_mule(this0, pb_three);

	83 this0o = (__vector short)vec_mulo(this0, pb_three);

	84 this0l = vec_mergeh(this0e, this0o);

	85 this0h = vec_mergel(this0e, this0o);

	86

	/* VEC_UNPACKHU / VEC_UNPACKLU are defined in jsimd_altivec.h (not visible
	 * here); presumably they zero-extend the two halves of a byte vector to
	 * 16-bit elements -- confirm.  Only the half needed for the first store
	 * gets its rounding term here; the other half is biased later, inside the
	 * incol > 8 branch. */
	87 last0l = (__vector short)VEC_UNPACKHU(p_last0);

	88 last0h = (__vector short)VEC_UNPACKLU(p_last0);

	89 last0l = vec_add(last0l, pw_one);

	90

	91 next0l = (__vector short)VEC_UNPACKHU(p_next0);

	92 next0h = (__vector short)VEC_UNPACKLU(p_next0);

	93 next0l = vec_add(next0l, pw_two);

	94

	/* First 8 input samples: even outputs use the left neighbour, odd outputs
	 * the right neighbour; both are divided by 4 via the shift. */
	95 outle = vec_add(this0l, last0l);

	96 outlo = vec_add(this0l, next0l);

	97 outle = vec_sr(outle, (__vector unsigned short)pw_two);

	98 outlo = vec_sr(outlo, (__vector unsigned short)pw_two);

	99

	/* Narrow to bytes and interleave even/odd results: 16 output samples. */
	100 out = vec_perm((__vector unsigned char)outle,

	101 (__vector unsigned char)outlo, merge_pack_index);

	102 vec_st(out, 0, outptr);

	103

	/* The second half of the block is only computed and stored when more than
	 * 8 input samples remain in the row. */
	104 if (incol > 8) {

	105 last0h = vec_add(last0h, pw_one);

	106 next0h = vec_add(next0h, pw_two);

	107

	108 outhe = vec_add(this0h, last0h);

	109 outho = vec_add(this0h, next0h);

	110 outhe = vec_sr(outhe, (__vector unsigned short)pw_two);

	111 outho = vec_sr(outho, (__vector unsigned short)pw_two);

	112

	113 out = vec_perm((__vector unsigned char)outhe,

	114 (__vector unsigned char)outho, merge_pack_index);

	115 vec_st(out, 16, outptr);

	116 }

	117

	/* The block loaded as "next" becomes the current block. */
	118 this0 = next0;

	119 }

	120 }

	121 }

	122

	123

	/*
	 * 2:1 horizontal and 2:1 vertical "fancy" upsampling.  Each input row
	 * produces two output rows, and each input sample produces two output
	 * samples per output row.
	 *
	 * Vertical step (16-bit column sums):
	 *   colsum_1 = 3 * row[inrow] + row[inrow - 1]   (feeds the upper output row)
	 *   colsum1  = 3 * row[inrow] + row[inrow + 1]   (feeds the lower output row)
	 * Horizontal step, applied to either column sum c with left/right
	 * neighbouring column sums l and r:
	 *   even output = (3 * c + l + 8) >> 4
	 *   odd output  = (3 * c + r + 7) >> 4
	 * The first column is its own left neighbour and the last column its own
	 * right neighbour.
	 *
	 * max_v_samp_factor  number of output rows to produce (two per input row)
	 * downsampled_width  number of input samples per row
	 * input_data         array of input row pointers; input_data[inrow - 1]
	 *                    and input_data[inrow + 1] are read as well, so the
	 *                    rows just outside the processed range must be valid
	 * output_data_ptr    pointer to the array of output row pointers
	 *
	 * Side effect: when downsampled_width is not a multiple of 16, the sample
	 * at index downsampled_width of all three input rows is overwritten with a
	 * copy of the last real sample.
	 *
	 * NOTE(review): rows are read and written in whole 16-byte vectors with
	 * vec_ld/vec_st, so the row buffers are presumably 16-byte aligned and
	 * padded by the caller -- confirm against the allocator.
	 */
	124 void

	125 jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,

	126 JDIMENSION downsampled_width,

	127 JSAMPARRAY input_data,

	128 JSAMPARRAY *output_data_ptr)

	129 {

	130 JSAMPARRAY output_data = *output_data_ptr;

	131 JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;

	132 int inrow, outrow, incol;

	133

	/* Naming: the "_1" suffix refers to the row above (inrow - 1) and the "1"
	 * suffix to the row below (inrow + 1); an "l"/"h" suffix marks the vector
	 * holding the first / second eight 16-bit values of a 16-sample block.
	 * lastcolsum*, thiscolsum* and nextcolsum* are the column sums of the
	 * previous, current and following block; the p_ variants are shifted by one
	 * element so each lane holds its left (p_last) or right (p_next) neighbour.
	 * The nextcolsum* vectors start at {0} because they are not written when a
	 * row consists of a single block but are still copied at the bottom of the
	 * loop. */
	134 __vector unsigned char this_1, this0, this1, out;

	135 __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,

	136 lastcolsum_1h, lastcolsum1h,

	137 p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,

	138 thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,

	139 nextcolsum_1l = {0}, nextcolsum_1h = {0},

	140 nextcolsum1l = {0}, nextcolsum1h = {0},

	141 p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,

	142 tmpl, tmph, outle, outhe, outlo, outho;

	143

	144 /* Constants */

	/* vec_perm selectors.  They are applied to vectors of 16-bit column sums,
	 * so the byte indices come in pairs (indices 0-15 pick from the first
	 * operand, 16-31 from the second):
	 *   last_index_col0     shift right by one element, duplicating element 0
	 *   last_index          element 7 of the first operand followed by
	 *                       elements 0-6 of the second
	 *   next_index          elements 1-7 of the first operand followed by
	 *                       element 0 of the second
	 *   next_index_lastcol  shift left by one element, duplicating element 7
	 *   merge_pack_index    alternately takes one byte of each 16-bit element
	 *                       from the two operands (byte 1 on big-endian, byte 0
	 *                       on little-endian, i.e. the low-order byte), which
	 *                       narrows the results to bytes and interleaves them
	 * pb_zero is not referenced directly in this function; presumably the
	 * VEC_UNPACK*U macros from jsimd_altivec.h use it -- confirm. */
	145 __vector unsigned char pb_zero = { __16X(0) },

	146 last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13},

	147 last_index={14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},

	148 next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},

	149 next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},

	150 #if __BIG_ENDIAN__

	151 merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};

	152 #else

	153 merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};

	154 #endif

	/* pw_three is the filter weight, pw_eight / pw_seven the rounding terms for
	 * even / odd outputs, pw_four the shift count (divide by 16). */
	155 __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },

	156 pw_seven = { __8X(7) }, pw_eight = { __8X(8) };

	157 __vector unsigned short pw_four = { __8X(4) };

	158

	/* outrow advances by two per iteration, inrow by one. */
	159 for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

	160

	161 inptr_1 = input_data[inrow - 1];

	162 inptr0 = input_data[inrow];

	163 inptr1 = input_data[inrow + 1];

	164 outptr0 = output_data[outrow++];

	165 outptr1 = output_data[outrow++];

	166

	/* Replicate the last real sample one position past the end of each row so
	 * that it serves as the right neighbour inside a partially filled block. */
	167 if (downsampled_width & 15) {

	168 inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];

	169 inptr0[downsampled_width] = inptr0[downsampled_width - 1];

	170 inptr1[downsampled_width] = inptr1[downsampled_width - 1];

	171 }

	172

	/* Prime the pipeline with the first block: 3 * centre row as 16-bit values
	 * (vec_mladd with a zero addend is a plain 16-bit multiply).
	 * VEC_UNPACKHU / VEC_UNPACKLU come from jsimd_altivec.h (not visible here);
	 * presumably they zero-extend the two halves of a byte vector -- confirm. */
	173 this0 = vec_ld(0, inptr0);

	174 this0l = (__vector short)VEC_UNPACKHU(this0);

	175 this0h = (__vector short)VEC_UNPACKLU(this0);

	176 this0l = vec_mladd(this0l, pw_three, pw_zero);

	177 this0h = vec_mladd(this0h, pw_three, pw_zero);

	178

	/* Column sums with the row above; the first column is its own left
	 * neighbour. */
	179 this_1 = vec_ld(0, inptr_1);

	180 this_1l = (__vector short)VEC_UNPACKHU(this_1);

	181 this_1h = (__vector short)VEC_UNPACKLU(this_1);

	182 thiscolsum_1l = vec_add(this0l, this_1l);

	183 thiscolsum_1h = vec_add(this0h, this_1h);

	184 lastcolsum_1h = thiscolsum_1h;

	185 p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);

	186 p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);

	187

	/* Same for the row below. */
	188 this1 = vec_ld(0, inptr1);

	189 this1l = (__vector short)VEC_UNPACKHU(this1);

	190 this1h = (__vector short)VEC_UNPACKLU(this1);

	191 thiscolsum1l = vec_add(this0l, this1l);

	192 thiscolsum1h = vec_add(this0h, this1h);

	193 lastcolsum1h = thiscolsum1h;

	194 p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);

	195 p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);

	196

	/* incol counts the input samples still to be processed in this row. */
	197 for (incol = downsampled_width; incol > 0;

	198 incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,

	199 outptr0 += 32, outptr1 += 32) {

	200

	/* True on every iteration except the first: the left neighbour of lane 0
	 * is the last column sum of the previous block. */
	201 if (downsampled_width - incol > 0) {

	202 p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);

	203 p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);

	204 p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);

	205 p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);

	206 lastcolsum_1h = thiscolsum_1h; lastcolsum1h = thiscolsum1h;

	207 }

	208

	/* Final block: the last column is its own right neighbour, nothing more is
	 * loaded.  Otherwise compute the column sums of the following block (this0*
	 * is overwritten with 3 * the next centre-row block) and borrow its first
	 * element as the right neighbour of the current last lane. */
	209 if (incol <= 16) {

	210 p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);

	211 p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,

	212 next_index_lastcol);

	213 p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);

	214 p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,

	215 next_index_lastcol);

	216 } else {

	217 this0 = vec_ld(16, inptr0);

	218 this0l = (__vector short)VEC_UNPACKHU(this0);

	219 this0h = (__vector short)VEC_UNPACKLU(this0);

	220 this0l = vec_mladd(this0l, pw_three, pw_zero);

	221 this0h = vec_mladd(this0h, pw_three, pw_zero);

	222

	223 this_1 = vec_ld(16, inptr_1);

	224 this_1l = (__vector short)VEC_UNPACKHU(this_1);

	225 this_1h = (__vector short)VEC_UNPACKLU(this_1);

	226 nextcolsum_1l = vec_add(this0l, this_1l);

	227 nextcolsum_1h = vec_add(this0h, this_1h);

	228 p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);

	229 p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);

	230

	231 this1 = vec_ld(16, inptr1);

	232 this1l = (__vector short)VEC_UNPACKHU(this1);

	233 this1h = (__vector short)VEC_UNPACKLU(this1);

	234 nextcolsum1l = vec_add(this0l, this1l);

	235 nextcolsum1h = vec_add(this0h, this1h);

	236 p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);

	237 p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);

	238 }

	239

	240 /* Process the upper row */

	241

	/* First 8 columns: even outputs = (3*c + left + 8) >> 4,
	 * odd outputs = (3*c + right + 7) >> 4. */
	242 tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);

	243 outle = vec_add(tmpl, p_lastcolsum_1l);

	244 outle = vec_add(outle, pw_eight);

	245 outle = vec_sr(outle, pw_four);

	246

	247 outlo = vec_add(tmpl, p_nextcolsum_1l);

	248 outlo = vec_add(outlo, pw_seven);

	249 outlo = vec_sr(outlo, pw_four);

	250

	/* Narrow to bytes and interleave even/odd results: 16 output samples. */
	251 out = vec_perm((__vector unsigned char)outle,

	252 (__vector unsigned char)outlo, merge_pack_index);

	253 vec_st(out, 0, outptr0);

	254

	/* The second half of the block is only computed and stored when more than
	 * 8 input samples remain in the row. */
	255 if (incol > 8) {

	256 tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);

	257 outhe = vec_add(tmph, p_lastcolsum_1h);

	258 outhe = vec_add(outhe, pw_eight);

	259 outhe = vec_sr(outhe, pw_four);

	260

	261 outho = vec_add(tmph, p_nextcolsum_1h);

	262 outho = vec_add(outho, pw_seven);

	263 outho = vec_sr(outho, pw_four);

	264

	265 out = vec_perm((__vector unsigned char)outhe,

	266 (__vector unsigned char)outho, merge_pack_index);

	267 vec_st(out, 16, outptr0);

	268 }

	269

	270 /* Process the lower row */

	271

	/* Identical arithmetic, using the column sums built with the row below. */
	272 tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);

	273 outle = vec_add(tmpl, p_lastcolsum1l);

	274 outle = vec_add(outle, pw_eight);

	275 outle = vec_sr(outle, pw_four);

	276

	277 outlo = vec_add(tmpl, p_nextcolsum1l);

	278 outlo = vec_add(outlo, pw_seven);

	279 outlo = vec_sr(outlo, pw_four);

	280

	281 out = vec_perm((__vector unsigned char)outle,

	282 (__vector unsigned char)outlo, merge_pack_index);

	283 vec_st(out, 0, outptr1);

	284

	285 if (incol > 8) {

	286 tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);

	287 outhe = vec_add(tmph, p_lastcolsum1h);

	288 outhe = vec_add(outhe, pw_eight);

	289 outhe = vec_sr(outhe, pw_four);

	290

	291 outho = vec_add(tmph, p_nextcolsum1h);

	292 outho = vec_add(outho, pw_seven);

	293 outho = vec_sr(outho, pw_four);

	294

	295 out = vec_perm((__vector unsigned char)outhe,

	296 (__vector unsigned char)outho, merge_pack_index);

	297 vec_st(out, 16, outptr1);

	298 }

	299

	/* The column sums computed for the following block become the current
	 * ones. */
	300 thiscolsum_1l = nextcolsum_1l; thiscolsum_1h = nextcolsum_1h;

	301 thiscolsum1l = nextcolsum1l; thiscolsum1h = nextcolsum1h;

	302 }

	303 }

	304 }

	305

	306

	307 /* These are rarely used (mainly just for decompressing YCCK images) */

	308

	/*
	 * Plain 2:1 horizontal upsampling: every input sample is written twice in
	 * a row, with no filtering.
	 *
	 * max_v_samp_factor  number of rows to process (input and output rows are
	 *                    indexed with the same row counter)
	 * output_width       number of output samples per row; the loop rounds it
	 *                    up to a multiple of 32 and always stores whole
	 *                    16-byte vectors, so up to that rounded-up width is
	 *                    written to each output row
	 * input_data         array of input row pointers
	 * output_data_ptr    pointer to the array of output row pointers
	 *
	 * NOTE(review): presumably the row buffers are 16-byte aligned and padded
	 * enough for the rounded-up accesses -- confirm against the allocator.
	 */
	309 void

	310 jsimd_h2v1_upsample_altivec (int max_v_samp_factor,

	311 JDIMENSION output_width,

	312 JSAMPARRAY input_data,

	313 JSAMPARRAY *output_data_ptr)

	314 {

	315 JSAMPARRAY output_data = *output_data_ptr;

	316 JSAMPROW inptr, outptr;

	317 int inrow, incol;

	318

	/* in = 16 input samples; inl / inh = its first / last 8 samples, each one
	 * doubled. */
	319 __vector unsigned char in, inl, inh;

	320

	321 for (inrow = 0; inrow < max_v_samp_factor; inrow++) {

	322 inptr = input_data[inrow];

	323 outptr = output_data[inrow];

	324

	/* incol counts remaining output samples (rounded up to a multiple of 32).
	 * One iteration covers up to 32 input / 64 output samples. */
	325 for (incol = (output_width + 31) & (~31); incol > 0;

	326 incol -= 64, inptr += 32, outptr += 64) {

	327

	/* Merging a vector with itself interleaves each byte with a copy of
	 * itself, i.e. duplicates every sample. */
	328 in = vec_ld(0, inptr);

	329 inl = vec_mergeh(in, in);

	330 inh = vec_mergel(in, in);

	331

	332 vec_st(inl, 0, outptr);

	333 vec_st(inh, 16, outptr);

	334

	/* Second group of 16 input samples, only when more than 32 output samples
	 * remain. */
	335 if (incol > 32) {

	336 in = vec_ld(16, inptr);

	337 inl = vec_mergeh(in, in);

	338 inh = vec_mergel(in, in);

	339

	340 vec_st(inl, 32, outptr);

	341 vec_st(inh, 48, outptr);

	342 }

	343 }

	344 }

	345 }

	346

	347

	/*
	 * Plain 2:1 horizontal and 2:1 vertical upsampling: every input sample is
	 * written twice in a row, and every expanded row is stored to two
	 * consecutive output rows.  No filtering is applied.
	 *
	 * max_v_samp_factor  number of output rows to produce (two per input row)
	 * output_width       number of output samples per row; the loop rounds it
	 *                    up to a multiple of 32 and always stores whole
	 *                    16-byte vectors, so up to that rounded-up width is
	 *                    written to each output row
	 * input_data         array of input row pointers
	 * output_data_ptr    pointer to the array of output row pointers
	 *
	 * NOTE(review): presumably the row buffers are 16-byte aligned and padded
	 * enough for the rounded-up accesses -- confirm against the allocator.
	 */
	348 void

	349 jsimd_h2v2_upsample_altivec (int max_v_samp_factor,

	350 JDIMENSION output_width,

	351 JSAMPARRAY input_data,

	352 JSAMPARRAY *output_data_ptr)

	353 {

	354 JSAMPARRAY output_data = *output_data_ptr;

	355 JSAMPROW inptr, outptr0, outptr1;

	356 int inrow, outrow, incol;

	357

	/* in = 16 input samples; inl / inh = its first / last 8 samples, each one
	 * doubled. */
	358 __vector unsigned char in, inl, inh;

	359

	/* outrow advances by two per iteration, inrow by one. */
	360 for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

	361

	362 inptr = input_data[inrow];

	363 outptr0 = output_data[outrow++];

	364 outptr1 = output_data[outrow++];

	365

	/* incol counts remaining output samples (rounded up to a multiple of 32).
	 * One iteration covers up to 32 input / 64 output samples per output row. */
	366 for (incol = (output_width + 31) & (~31); incol > 0;

	367 incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) {

	368

	/* Merging a vector with itself interleaves each byte with a copy of
	 * itself, i.e. duplicates every sample. */
	369 in = vec_ld(0, inptr);

	370 inl = vec_mergeh(in, in);

	371 inh = vec_mergel(in, in);

	372

	/* The same expanded vectors go to both output rows. */
	373 vec_st(inl, 0, outptr0);

	374 vec_st(inl, 0, outptr1);

	375

	376 vec_st(inh, 16, outptr0);

	377 vec_st(inh, 16, outptr1);

	378

	/* Second group of 16 input samples, only when more than 32 output samples
	 * remain. */
	379 if (incol > 32) {

	380 in = vec_ld(16, inptr);

	381 inl = vec_mergeh(in, in);

	382 inh = vec_mergel(in, in);

	383

	384 vec_st(inl, 32, outptr0);

	385 vec_st(inl, 32, outptr1);

	386

	387 vec_st(inh, 48, outptr0);

	388 vec_st(inh, 48, outptr1);

	389 }

	390 }

	391 }

	392 }

OLD	NEW

« simd/jccolext-sse2-64.asm ('K') | « simd/jdmrgext-sse2-64.asm ('k') | simd/jdsample-mmx.asm » ('j') | no next file with comments »