third_party/libpng/contrib/intel/filter_sse2_intrinsics.c - Issue 2021403002: Update libpng to 1.6.22

Side by Side Diff: third_party/libpng/contrib/intel/filter_sse2_intrinsics.c

Issue 2021403002: Update libpng to 1.6.22 (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Rearrange pnglibconf.h Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1

	2 /* filter_sse2_intrinsics.c - SSE2 optimized filter functions

	3 *

	4 * Copyright (c) 2016 Google, Inc.

	5 * Written by Mike Klein and Matt Sarett

	6 * Derived from arm/filter_neon_intrinsics.c, which was

	7 * Copyright (c) 2014,2016 Glenn Randers-Pehrson

	8 *

	9 * Last changed in libpng 1.6.22 [May 26, 2016]

	10 *

	11 * This code is released under the libpng license.

	12 * For conditions of distribution and use, see the disclaimer

	13 * and license in png.h

	14 */

	15

	16 #include "../../pngpriv.h"

	17

	18 #ifdef PNG_READ_SUPPORTED

	19

	20 #if PNG_INTEL_SSE_IMPLEMENTATION > 0

	21

	#include <immintrin.h>
	#include <string.h>

	23

	24 /* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).

	25 * They're positioned like this:

	26 * prev: c b

	27 * row: a d

	28 * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be

	29 * whichever of a, b, or c is closest to p=a+b-c.

	30 */

	31

	/* Load the 4 bytes at p into the low 32 bits of an SSE register.
	 * _mm_cvtsi32_si128 zeroes the remaining 96 bits.
	 *
	 * Callers step through row buffers 3 or 4 bytes at a time, so p is not
	 * assumed to be int-aligned.  The bytes are copied out with memcpy rather
	 * than read through a casted int pointer, which avoids both the misaligned
	 * access and the strict-aliasing violation.
	 */
	static __m128i load4(const void* p) {
	   int tmp;
	   memcpy(&tmp, p, sizeof(tmp));
	   return _mm_cvtsi32_si128(tmp);
	}

	35

	/* Store the low 32 bits of v to the 4 bytes at p.
	 *
	 * p is not assumed to be int-aligned (callers walk byte buffers), so the
	 * value is written with memcpy instead of through a casted int pointer.
	 */
	static void store4(void* p, __m128i v) {
	   int tmp = _mm_cvtsi128_si32(v);
	   memcpy(p, &tmp, sizeof(tmp));
	}

	39

	/* Load exactly 3 bytes from p into the low 24 bits of an SSE register.
	 * All higher bits of the result are zero.
	 *
	 * Only 3 bytes are read, so this is safe for the last 3-byte pixel of a
	 * row, where a 4-byte load would run past the end.  The bytes are copied
	 * with memcpy into a zeroed 32-bit temporary; p therefore needs no
	 * particular alignment.  The byte order matches a plain 32-bit load on
	 * little-endian x86, which is the only target of this SSE2 file.
	 */
	static __m128i load3(const void* p) {
	   int tmp = 0;
	   memcpy(&tmp, p, 3);
	   return _mm_cvtsi32_si128(tmp);
	}

	51

	/* Store the low 3 bytes of v to p, leaving the byte at p+3 untouched.
	 *
	 * The value is pulled out of SSE as a 32-bit int and its first 3 bytes
	 * are copied with memcpy, so p needs no particular alignment.  On
	 * little-endian x86 those are the three least significant bytes.
	 */
	static void store3(void* p, __m128i v) {
	   int tmp = _mm_cvtsi128_si32(v);
	   memcpy(p, &tmp, 3);
	}

	64

	65 void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,

	66 png_const_bytep prev)

	67 {

	68 /* The Sub filter predicts each pixel as the previous pixel, a.

	69 * There is no pixel to the left of the first pixel. It's encoded directly.

	70 * That works with our main loop if we just say that left pixel was zero.

	71 */

	72 png_debug(1, "in png_read_filter_row_sub3_sse2");

	73 __m128i a, d = _mm_setzero_si128();

	74

	75 int rb = row_info->rowbytes;

	76 while (rb >= 4) {

	77 a = d; d = load4(row);

	78 d = _mm_add_epi8(d, a);

	79 store3(row, d);

	80

	81 row += 3;

	82 rb -= 3;

	83 }

	84 if (rb > 0) {

	85 a = d; d = load3(row);

	86 d = _mm_add_epi8(d, a);

	87 store3(row, d);

	88

	89 row += 3;

	90 rb -= 3;

	91 }

	92 }

	93

	94 void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row,

	95 png_const_bytep prev)

	96 {

	97 /* The Sub filter predicts each pixel as the previous pixel, a.

	98 * There is no pixel to the left of the first pixel. It's encoded directly.

	99 * That works with our main loop if we just say that left pixel was zero.

	100 */

	101 png_debug(1, "in png_read_filter_row_sub4_sse2");

	102 __m128i a, d = _mm_setzero_si128();

	103

	104 int rb = row_info->rowbytes;

	105 while (rb > 0) {

	106 a = d; d = load4(row);

	107 d = _mm_add_epi8(d, a);

	108 store4(row, d);

	109

	110 row += 4;

	111 rb -= 4;

	112 }

	113 }

	114

	115 void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,

	116 png_const_bytep prev)

	117 {

	118 /* The Avg filter predicts each pixel as the (truncated) average of a and b.

	119 * There's no pixel to the left of the first pixel. Luckily, it's

	120 * predicted to be half of the pixel above it. So again, this works

	121 * perfectly with our loop if we make sure a starts at zero.

	122 */

	123 png_debug(1, "in png_read_filter_row_avg3_sse2");

	124 const __m128i zero = _mm_setzero_si128();

	125 __m128i b;

	126 __m128i a, d = zero;

	127

	128 int rb = row_info->rowbytes;

	129 while (rb >= 4) {

	130 b = load4(prev);

	131 a = d; d = load4(row );

	132

	133 /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */

	134 __m128i avg = _mm_avg_epu8(a,b);

	135 /* ...but we can fix it up by subtracting off 1 if it rounded up. */

	136 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),

	137 _mm_set1_epi8(1)));

	138 d = _mm_add_epi8(d, avg);

	139 store3(row, d);

	140

	141 prev += 3;

	142 row += 3;

	143 rb -= 3;

	144 }

	145 if (rb > 0) {

	146 b = load3(prev);

	147 a = d; d = load3(row );

	148

	149 /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */

	150 __m128i avg = _mm_avg_epu8(a,b);

	151 /* ...but we can fix it up by subtracting off 1 if it rounded up. */

	152 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),

	153 _mm_set1_epi8(1)));

	154

	155 d = _mm_add_epi8(d, avg);

	156 store3(row, d);

	157

	158 prev += 3;

	159 row += 3;

	160 rb -= 3;

	161 }

	162 }

	163

	164 void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,

	165 png_const_bytep prev)

	166 {

	167 /* The Avg filter predicts each pixel as the (truncated) average of a and b.

	168 * There's no pixel to the left of the first pixel. Luckily, it's

	169 * predicted to be half of the pixel above it. So again, this works

	170 * perfectly with our loop if we make sure a starts at zero.

	171 */

	172 png_debug(1, "in png_read_filter_row_avg4_sse2");

	173 const __m128i zero = _mm_setzero_si128();

	174 __m128i b;

	175 __m128i a, d = zero;

	176

	177 int rb = row_info->rowbytes;

	178 while (rb > 0) {

	179 b = load4(prev);

	180 a = d; d = load4(row );

	181

	182 /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */

	183 __m128i avg = _mm_avg_epu8(a,b);

	184 /* ...but we can fix it up by subtracting off 1 if it rounded up. */

	185 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),

	186 _mm_set1_epi8(1)));

	187

	188 d = _mm_add_epi8(d, avg);

	189 store4(row, d);

	190

	191 prev += 4;

	192 row += 4;

	193 rb -= 4;

	194 }

	195 }

	196

	/* Absolute value of each signed 16-bit lane of x. */
	static __m128i abs_i16(__m128i x) {
	#if PNG_INTEL_SSE_IMPLEMENTATION >= 2
	   return _mm_abs_epi16(x);
	#else
	   /* sign is 0xFFFF (-1) in negative lanes and 0 elsewhere.
	    * (x ^ sign) - sign is then ~x + 1 == -x for negative lanes, and
	    * x - 0 == x for the others.
	    */
	   const __m128i sign = _mm_srai_epi16(x, 15);
	   return _mm_sub_epi16(_mm_xor_si128(x, sign), sign);
	#endif
	}

	215

	/* Select between t and e under mask c: the result takes t where c is set
	 * and e where it is clear.  The callers in this file pass comparison
	 * results for c, which are all-ones or all-zeros in every lane.
	 */
	static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
	#if PNG_INTEL_SSE_IMPLEMENTATION >= 3
	   return _mm_blendv_epi8(e,t,c);
	#else
	   /* e ^ ((t ^ e) & c): where c is set the two e terms cancel and t is
	    * left; where c is clear the result is just e.
	    */
	   const __m128i differing = _mm_xor_si128(t, e);
	   return _mm_xor_si128(e, _mm_and_si128(c, differing));
	#endif
	}

	224

	225 void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,

	226 png_const_bytep prev)

	227 {

	228 /* Paeth tries to predict pixel d using the pixel to the left of it, a,

	229 * and two pixels from the previous row, b and c:

	230 * prev: c b

	231 * row: a d

	232 * The Paeth function predicts d to be whichever of a, b, or c is nearest to

	233 * p=a+b-c.

	234 *

	235 * The first pixel has no left context, and so uses an Up filter, p = b.

	236 * This works naturally with our main loop's p = a+b-c if we force a and c

	237 * to zero.

	238 * Here we zero b and d, which become c and a respectively at the start of

	239 * the loop.

	240 */

	241 png_debug(1, "in png_read_filter_row_paeth3_sse2");

	242 const __m128i zero = _mm_setzero_si128();

	243 __m128i c, b = zero,

	244 a, d = zero;

	245

	246 int rb = row_info->rowbytes;

	247 while (rb >= 4) {

	248 /* It's easiest to do this math (particularly, deal with pc) with 16-bit

	249 * intermediates.

	250 */

	251 c = b; b = _mm_unpacklo_epi8(load4(prev), zero);

	252 a = d; d = _mm_unpacklo_epi8(load4(row ), zero);

	253

	254 /* (p-a) == (a+b-c - a) == (b-c) */

	255 __m128i pa = _mm_sub_epi16(b,c);

	256

	257 /* (p-b) == (a+b-c - b) == (a-c) */

	258 __m128i pb = _mm_sub_epi16(a,c);

	259

	260 /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */

	261 __m128i pc = _mm_add_epi16(pa,pb);

	262

	263 pa = abs_i16(pa); /* \|p-a\| */

	264 pb = abs_i16(pb); /* \|p-b\| */

	265 pc = abs_i16(pc); /* \|p-c\| */

	266

	267 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

	268

	269 /* Paeth breaks ties favoring a over b over c. */

	270 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,

	271 if_then_else(_mm_cmpeq_epi16(smallest, pb), b,

	272 c));

	273

	274 /* Note `_epi8`: we need addition to wrap modulo 255. */

	275 d = _mm_add_epi8(d, nearest);

	276 store3(row, _mm_packus_epi16(d,d));

	277

	278 prev += 3;

	279 row += 3;

	280 rb -= 3;

	281 }

	282 if (rb > 0) {

	283 /* It's easiest to do this math (particularly, deal with pc) with 16-bit

	284 * intermediates.

	285 */

	286 c = b; b = _mm_unpacklo_epi8(load3(prev), zero);

	287 a = d; d = _mm_unpacklo_epi8(load3(row ), zero);

	288

	289 /* (p-a) == (a+b-c - a) == (b-c) */

	290 __m128i pa = _mm_sub_epi16(b,c);

	291

	292 /* (p-b) == (a+b-c - b) == (a-c) */

	293 __m128i pb = _mm_sub_epi16(a,c);

	294

	295 /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */

	296 __m128i pc = _mm_add_epi16(pa,pb);

	297

	298 pa = abs_i16(pa); /* \|p-a\| */

	299 pb = abs_i16(pb); /* \|p-b\| */

	300 pc = abs_i16(pc); /* \|p-c\| */

	301

	302 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

	303

	304 /* Paeth breaks ties favoring a over b over c. */

	305 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,

	306 if_then_else(_mm_cmpeq_epi16(smallest, pb), b,

	307 c));

	308

	309 /* Note `_epi8`: we need addition to wrap modulo 255. */

	310 d = _mm_add_epi8(d, nearest);

	311 store3(row, _mm_packus_epi16(d,d));

	312

	313 prev += 3;

	314 row += 3;

	315 rb -= 3;

	316 }

	317 }

	318

	319 void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,

	320 png_const_bytep prev)

	321 {

	322 /* Paeth tries to predict pixel d using the pixel to the left of it, a,

	323 * and two pixels from the previous row, b and c:

	324 * prev: c b

	325 * row: a d

	326 * The Paeth function predicts d to be whichever of a, b, or c is nearest to

	327 * p=a+b-c.

	328 *

	329 * The first pixel has no left context, and so uses an Up filter, p = b.

	330 * This works naturally with our main loop's p = a+b-c if we force a and c

	331 * to zero.

	332 * Here we zero b and d, which become c and a respectively at the start of

	333 * the loop.

	334 */

	335 png_debug(1, "in png_read_filter_row_paeth4_sse2");

	336 const __m128i zero = _mm_setzero_si128();

	337 __m128i c, b = zero,

	338 a, d = zero;

	339

	340 int rb = row_info->rowbytes;

	341 while (rb > 0) {

	342 /* It's easiest to do this math (particularly, deal with pc) with 16-bit

	343 * intermediates.

	344 */

	345 c = b; b = _mm_unpacklo_epi8(load4(prev), zero);

	346 a = d; d = _mm_unpacklo_epi8(load4(row ), zero);

	347

	348 /* (p-a) == (a+b-c - a) == (b-c) */

	349 __m128i pa = _mm_sub_epi16(b,c);

	350

	351 /* (p-b) == (a+b-c - b) == (a-c) */

	352 __m128i pb = _mm_sub_epi16(a,c);

	353

	354 /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */

	355 __m128i pc = _mm_add_epi16(pa,pb);

	356

	357 pa = abs_i16(pa); /* \|p-a\| */

	358 pb = abs_i16(pb); /* \|p-b\| */

	359 pc = abs_i16(pc); /* \|p-c\| */

	360

	361 __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

	362

	363 /* Paeth breaks ties favoring a over b over c. */

	364 __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,

	365 if_then_else(_mm_cmpeq_epi16(smallest, pb), b,

	366 c));

	367

	368 /* Note `_epi8`: we need addition to wrap modulo 255. */

	369 d = _mm_add_epi8(d, nearest);

	370 store4(row, _mm_packus_epi16(d,d));

	371

	372 prev += 4;

	373 row += 4;

	374 rb -= 4;

	375 }

	376 }

	377

	378 #endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */

	379 #endif /* READ */

OLD	NEW

« no previous file with comments | « third_party/libpng/contrib/intel/INSTALL ('k') | third_party/libpng/contrib/intel/intel_init.c » ('j') | no next file with comments »