simd/jidctfst-altivec.c - Issue 1934113002: Update libjpeg_turbo to 1.4.90 from https://github.com/libjpeg-turbo/

Side by Side Diff: simd/jidctfst-altivec.c

Issue 1934113002: Update libjpeg_turbo to 1.4.90 from https://github.com/libjpeg-turbo/ (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 * AltiVec optimizations for libjpeg-turbo

	3 *

	4 * Copyright (C) 2014-2015, D. R. Commander.

	5 * All rights reserved.

	6 * This software is provided 'as-is', without any express or implied

	7 * warranty. In no event will the authors be held liable for any damages

	8 * arising from the use of this software.

	9 *

	10 * Permission is granted to anyone to use this software for any purpose,

	11 * including commercial applications, and to alter it and redistribute it

	12 * freely, subject to the following restrictions:

	13 *

	14 * 1. The origin of this software must not be misrepresented; you must not

	15 * claim that you wrote the original software. If you use this software

	16 * in a product, an acknowledgment in the product documentation would be

	17 * appreciated but is not required.

	18 * 2. Altered source versions must be plainly marked as such, and must not be

	19 * misrepresented as being the original software.

	20 * 3. This notice may not be removed or altered from any source distribution.

	21 */

	22

	23 /* FAST INTEGER INVERSE DCT

	24 *

	25 * This is similar to the SSE2 implementation, except that we left-shift the

	26 * constants by 1 less bit (the -1 in CONST_SHIFT.) This is because

	27 * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:

	28 * the elements in arg3 + the most significant 17 bits of

	29 * (the elements in arg1 * the elements in arg2).

	30 */

	31

	32 #include "jsimd_altivec.h"

	33

	34

	/*
	 * Fixed-point multipliers.  Each F_x_xxx value is the real constant named
	 * in its trailing FIX() comment multiplied by 2^CONST_BITS (256) and
	 * rounded to the nearest integer.  F_1_613 is FIX(2.613125930) with
	 * FIX(1) == 256 subtracted.
	 */
	35 #define F_1_082 277 /* FIX(1.082392200) */

	36 #define F_1_414 362 /* FIX(1.414213562) */

	37 #define F_1_847 473 /* FIX(1.847759065) */

	38 #define F_2_613 669 /* FIX(2.613125930) */

	39 #define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */

	40

	/*
	 * CONST_BITS is the number of fraction bits in the F_x_xxx values.
	 * PASS1_BITS + 3 is the right-shift applied to the outputs of the second
	 * pass.  PRE_MULTIPLY_SCALE_BITS is the left-shift applied to an operand
	 * before it is multiplied with vec_madds.  CONST_SHIFT evaluates to
	 * 16 - 2 - 8 - 1 = 5 and is the left-shift used when the F_x_xxx values
	 * are loaded into the constant vectors.
	 */
	41 #define CONST_BITS 8

	42 #define PASS1_BITS 2

	43 #define PRE_MULTIPLY_SCALE_BITS 2

	44 #define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)

	45

	46

	/*
	 * DO_IDCT(in): one 1-D pass of the inverse DCT over eight vectors.
	 *
	 * `in` is a name prefix: the macro reads in##0 .. in##7 (it is invoked
	 * with `col` and with `row`) and writes out0 .. out7.  It declares
	 * nothing itself, so the expansion site must provide the temporaries
	 * tmp0-tmp7, tmp10-tmp13, z5, z10, z10s, z11, z12s and z13, and the
	 * constants pw_zero, pw_F1414, pw_F1847, pw_MF1613, pw_F1082 and
	 * pre_multiply_scale_bits.
	 *
	 * Every operand passed to vec_madds is first shifted left by
	 * pre_multiply_scale_bits.  z10 is kept in unshifted form as well as in
	 * shifted form (z10s) because the final "- z10" term of tmp12 uses the
	 * unshifted value.
	 *
	 * tmp10, tmp11 and tmp12 are assigned again in the odd part, after the
	 * even part has already folded them into tmp0-tmp3, so the statement
	 * order must be preserved.
	 *
	 * The expansion is a bare { } block rather than do { } while (0), so
	 * "DO_IDCT(x);" leaves an empty statement after the block.
	 */
	47 #define DO_IDCT(in) \

	48 { \

	49 /* Even part */ \

	50 \

	51 tmp10 = vec_add(in##0, in##4); \

	52 tmp11 = vec_sub(in##0, in##4); \

	53 tmp13 = vec_add(in##2, in##6); \

	54 \

	55 tmp12 = vec_sub(in##2, in##6); \

	56 tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \

	57 tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \

	58 tmp12 = vec_sub(tmp12, tmp13); \

	59 \

	60 tmp0 = vec_add(tmp10, tmp13); \

	61 tmp3 = vec_sub(tmp10, tmp13); \

	62 tmp1 = vec_add(tmp11, tmp12); \

	63 tmp2 = vec_sub(tmp11, tmp12); \

	64 \

	65 /* Odd part */ \

	66 \

	67 z13 = vec_add(in##5, in##3); \

	68 z10 = vec_sub(in##5, in##3); \

	69 z10s = vec_sl(z10, pre_multiply_scale_bits); \

	70 z11 = vec_add(in##1, in##7); \

	71 z12s = vec_sub(in##1, in##7); \

	72 z12s = vec_sl(z12s, pre_multiply_scale_bits); \

	73 \

	74 tmp11 = vec_sub(z11, z13); \

	75 tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \

	76 tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \

	77 \

	78 tmp7 = vec_add(z11, z13); \

	79 \

	80 /* To avoid overflow... \

	81 * \

	82 * (Original) \

	83 * tmp12 = -2.613125930 * z10 + z5; \

	84 * \

	85 * (This implementation) \

	86 * tmp12 = (-1.613125930 - 1) * z10 + z5; \

	87 * = -1.613125930 * z10 - z10 + z5; \

	88 */ \

	89 \

	90 z5 = vec_add(z10s, z12s); \

	91 z5 = vec_madds(z5, pw_F1847, pw_zero); \

	92 \

	93 tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \

	94 tmp10 = vec_sub(tmp10, z5); \

	95 tmp12 = vec_madds(z10s, pw_MF1613, z5); \

	96 tmp12 = vec_sub(tmp12, z10); \

	97 \

	98 tmp6 = vec_sub(tmp12, tmp7); \

	99 tmp5 = vec_sub(tmp11, tmp6); \

	100 tmp4 = vec_add(tmp10, tmp5); \

	101 \

	102 out0 = vec_add(tmp0, tmp7); \

	103 out1 = vec_add(tmp1, tmp6); \

	104 out2 = vec_add(tmp2, tmp5); \

	105 out3 = vec_sub(tmp3, tmp4); \

	106 out4 = vec_add(tmp3, tmp4); \

	107 out5 = vec_sub(tmp2, tmp5); \

	108 out6 = vec_sub(tmp1, tmp6); \

	109 out7 = vec_sub(tmp0, tmp7); \

	110 }

	111

	112

	/*
	 * jsimd_idct_ifast_altivec: dequantize one block of coefficients, run the
	 * two-pass inverse DCT defined by DO_IDCT, and store the result as eight
	 * rows of eight samples.
	 *
	 * dct_table_  - multiplier table; read as eight 16-byte vectors of shorts
	 *               (byte offsets 0..112) and multiplied element-wise into
	 *               the coefficients.
	 * coef_block  - input coefficients; read as eight 16-byte vectors.
	 * output_buf  - output_buf[0] .. output_buf[7] are the destination rows.
	 * output_col  - offset added to each row pointer before storing.
	 *
	 * NOTE(review): vec_ld ignores the low four address bits, so coef_block
	 * and dct_table are presumably 16-byte aligned -- confirm against the
	 * callers.  TRANSPOSE, __8X, __16X and CENTERJSAMPLE are not defined in
	 * this file; presumably they come from jsimd_altivec.h.
	 */
	113 void

	114 jsimd_idct_ifast_altivec (void *dct_table_, JCOEFPTR coef_block,

	115 JSAMPARRAY output_buf, JDIMENSION output_col)

	116 {

	/* dct_table must be a pointer: it is the address operand of vec_ld. */
	117 short *dct_table = (short *)dct_table_;

	118 int *outptr;

	119

	120 __vector short row0, row1, row2, row3, row4, row5, row6, row7,

	121 col0, col1, col2, col3, col4, col5, col6, col7,

	122 quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,

	123 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,

	124 z5, z10, z10s, z11, z12s, z13,

	125 out0, out1, out2, out3, out4, out5, out6, out7;

	126 __vector signed char outb;

	127

	128 /* Constants */

	129 __vector short pw_zero = { __8X(0) },

	130 pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },

	131 pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },

	132 pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) },

	133 pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };

	134 __vector unsigned short

	135 pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },

	136 pass1_bits3 = { __8X(PASS1_BITS + 3) };

	137 __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };

	138

	139 /* Pass 1: process columns */

	140

	141 col0 = vec_ld(0, coef_block);

	142 col1 = vec_ld(16, coef_block);

	143 col2 = vec_ld(32, coef_block);

	144 col3 = vec_ld(48, coef_block);

	145 col4 = vec_ld(64, coef_block);

	146 col5 = vec_ld(80, coef_block);

	147 col6 = vec_ld(96, coef_block);

	148 col7 = vec_ld(112, coef_block);

	149

	/* OR vectors 1-7 together: tmp1 is all-zero only if every coefficient
	 * outside the first vector is zero. */
	150 tmp1 = vec_or(col1, col2);

	151 tmp2 = vec_or(col3, col4);

	152 tmp1 = vec_or(tmp1, tmp2);

	153 tmp3 = vec_or(col5, col6);

	154 tmp3 = vec_or(tmp3, col7);

	155 tmp1 = vec_or(tmp1, tmp3);

	156

	/* Vector 0 is dequantized unconditionally; both branches below use it. */
	157 quant0 = vec_ld(0, dct_table);

	158 col0 = vec_mladd(col0, quant0, pw_zero);

	159

	160 if (vec_all_eq(tmp1, pw_zero)) {

	161 /* AC terms all zero */

	/* Shortcut: skip the first DO_IDCT and the transpose, and fill rowN
	 * with element N of the dequantized col0. */
	162

	163 row0 = vec_splat(col0, 0);

	164 row1 = vec_splat(col0, 1);

	165 row2 = vec_splat(col0, 2);

	166 row3 = vec_splat(col0, 3);

	167 row4 = vec_splat(col0, 4);

	168 row5 = vec_splat(col0, 5);

	169 row6 = vec_splat(col0, 6);

	170 row7 = vec_splat(col0, 7);

	171

	172 } else {

	173

	174 quant1 = vec_ld(16, dct_table);

	175 quant2 = vec_ld(32, dct_table);

	176 quant3 = vec_ld(48, dct_table);

	177 quant4 = vec_ld(64, dct_table);

	178 quant5 = vec_ld(80, dct_table);

	179 quant6 = vec_ld(96, dct_table);

	180 quant7 = vec_ld(112, dct_table);

	181

	182 col1 = vec_mladd(col1, quant1, pw_zero);

	183 col2 = vec_mladd(col2, quant2, pw_zero);

	184 col3 = vec_mladd(col3, quant3, pw_zero);

	185 col4 = vec_mladd(col4, quant4, pw_zero);

	186 col5 = vec_mladd(col5, quant5, pw_zero);

	187 col6 = vec_mladd(col6, quant6, pw_zero);

	188 col7 = vec_mladd(col7, quant7, pw_zero);

	189

	190 DO_IDCT(col);

	191

	192 TRANSPOSE(out, row);

	193 }

	194

	195 /* Pass 2: process rows */

	196

	197 DO_IDCT(row);

	198

	/* Arithmetic right shift of every output by PASS1_BITS + 3. */
	199 out0 = vec_sra(out0, pass1_bits3);

	200 out1 = vec_sra(out1, pass1_bits3);

	201 out2 = vec_sra(out2, pass1_bits3);

	202 out3 = vec_sra(out3, pass1_bits3);

	203 out4 = vec_sra(out4, pass1_bits3);

	204 out5 = vec_sra(out5, pass1_bits3);

	205 out6 = vec_sra(out6, pass1_bits3);

	206 out7 = vec_sra(out7, pass1_bits3);

	207

	208 TRANSPOSE(out, col);

	209

	/* For each of the eight rows: pack colN with itself into signed bytes
	 * (vec_packs saturates), add the pb_centerjsamp bias, then write the
	 * row as two 4-byte element stores at byte offsets 0 and 4 from
	 * output_buf[N] + output_col. */
	210 outb = vec_packs(col0, col0);

	211 outb = vec_add(outb, pb_centerjsamp);

	212 outptr = (int *)(output_buf[0] + output_col);

	213 vec_ste((__vector int)outb, 0, outptr);

	214 vec_ste((__vector int)outb, 4, outptr);

	215

	216 outb = vec_packs(col1, col1);

	217 outb = vec_add(outb, pb_centerjsamp);

	218 outptr = (int *)(output_buf[1] + output_col);

	219 vec_ste((__vector int)outb, 0, outptr);

	220 vec_ste((__vector int)outb, 4, outptr);

	221

	222 outb = vec_packs(col2, col2);

	223 outb = vec_add(outb, pb_centerjsamp);

	224 outptr = (int *)(output_buf[2] + output_col);

	225 vec_ste((__vector int)outb, 0, outptr);

	226 vec_ste((__vector int)outb, 4, outptr);

	227

	228 outb = vec_packs(col3, col3);

	229 outb = vec_add(outb, pb_centerjsamp);

	230 outptr = (int *)(output_buf[3] + output_col);

	231 vec_ste((__vector int)outb, 0, outptr);

	232 vec_ste((__vector int)outb, 4, outptr);

	233

	234 outb = vec_packs(col4, col4);

	235 outb = vec_add(outb, pb_centerjsamp);

	236 outptr = (int *)(output_buf[4] + output_col);

	237 vec_ste((__vector int)outb, 0, outptr);

	238 vec_ste((__vector int)outb, 4, outptr);

	239

	240 outb = vec_packs(col5, col5);

	241 outb = vec_add(outb, pb_centerjsamp);

	242 outptr = (int *)(output_buf[5] + output_col);

	243 vec_ste((__vector int)outb, 0, outptr);

	244 vec_ste((__vector int)outb, 4, outptr);

	245

	246 outb = vec_packs(col6, col6);

	247 outb = vec_add(outb, pb_centerjsamp);

	248 outptr = (int *)(output_buf[6] + output_col);

	249 vec_ste((__vector int)outb, 0, outptr);

	250 vec_ste((__vector int)outb, 4, outptr);

	251

	252 outb = vec_packs(col7, col7);

	253 outb = vec_add(outb, pb_centerjsamp);

	254 outptr = (int *)(output_buf[7] + output_col);

	255 vec_ste((__vector int)outb, 0, outptr);

	256 vec_ste((__vector int)outb, 4, outptr);

	257 }

OLD	NEW

« simd/jccolext-sse2-64.asm ('K') | « simd/jidctflt-sse2-64.asm ('k') | simd/jidctfst-mmx.asm » ('j') | no next file with comments »