Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(343)

Side by Side Diff: third_party/libpng/arm/filter_neon_intrinsics.c

Issue 2021403002: Update libpng to 1.6.22 (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Rearrange pnglibconf.h Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/libpng/arm/filter_neon.S ('k') | third_party/libpng/contrib/intel/INSTALL » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1
2 /* filter_neon_intrinsics.c - NEON optimised filter functions
3 *
4 * Copyright (c) 2014,2016 Glenn Randers-Pehrson
5 * Written by James Yu <james.yu at linaro.org>, October 2013.
6 * Based on filter_neon.S, written by Mans Rullgard, 2011.
7 *
8 * Last changed in libpng 1.6.22 [May 26, 2016]
9 *
10 * This code is released under the libpng license.
11 * For conditions of distribution and use, see the disclaimer
12 * and license in png.h
13 */
14
15 #include "../pngpriv.h"
16
17 #ifdef PNG_READ_SUPPORTED
18
19 /* This code requires -mfpu=neon on the command line: */
20 #if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */
21
22 #include <arm_neon.h>
23
24 /* libpng row pointers are not necessarily aligned to any particular boundary,
25 * however this code will only work with appropriate alignment. arm/arm_init.c
26 * checks for this (and will not compile unless it is done). This code uses
27 * variants of png_aligncast to avoid compiler warnings.
28 */
29 #define png_ptr(type,pointer) png_aligncast(type *,pointer)
30 #define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer)
31
32 /* The following relies on a variable 'temp_pointer' being declared with type
33 * 'type'. This is written this way just to hide the GCC strict aliasing
34 * warning; note that the code is safe because there never is an alias between
35 * the input and output pointers.
36 */
37 #define png_ldr(type,pointer)\
38 (temp_pointer = png_ptr(type,pointer), *temp_pointer)
39
40 #if PNG_ARM_NEON_OPT > 0
41
42 void
43 png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
44 png_const_bytep prev_row)
45 {
46 png_bytep rp = row;
47 png_bytep rp_stop = row + row_info->rowbytes;
48 png_const_bytep pp = prev_row;
49
50 png_debug(1, "in png_read_filter_row_up_neon");
51
52 for (; rp < rp_stop; rp += 16, pp += 16)
53 {
54 uint8x16_t qrp, qpp;
55
56 qrp = vld1q_u8(rp);
57 qpp = vld1q_u8(pp);
58 qrp = vaddq_u8(qrp, qpp);
59 vst1q_u8(rp, qrp);
60 }
61 }
62
63 void
64 png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
65 png_const_bytep prev_row)
66 {
67 png_bytep rp = row;
68 png_bytep rp_stop = row + row_info->rowbytes;
69
70 uint8x16_t vtmp = vld1q_u8(rp);
71 uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
72 uint8x8x2_t vrp = *vrpt;
73
74 uint8x8x4_t vdest;
75 vdest.val[3] = vdup_n_u8(0);
76
77 png_debug(1, "in png_read_filter_row_sub3_neon");
78
79 for (; rp < rp_stop;)
80 {
81 uint8x8_t vtmp1, vtmp2;
82 uint32x2_t *temp_pointer;
83
84 vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
85 vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
86 vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);
87 vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
88
89 vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
90 vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);
91 vdest.val[3] = vadd_u8(vdest.val[2], vtmp1);
92
93 vtmp = vld1q_u8(rp + 12);
94 vrpt = png_ptr(uint8x8x2_t, &vtmp);
95 vrp = *vrpt;
96
97 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
98 rp += 3;
99 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
100 rp += 3;
101 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
102 rp += 3;
103 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
104 rp += 3;
105 }
106
107 PNG_UNUSED(prev_row)
108 }
109
110 void
111 png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
112 png_const_bytep prev_row)
113 {
114 png_bytep rp = row;
115 png_bytep rp_stop = row + row_info->rowbytes;
116
117 uint8x8x4_t vdest;
118 vdest.val[3] = vdup_n_u8(0);
119
120 png_debug(1, "in png_read_filter_row_sub4_neon");
121
122 for (; rp < rp_stop; rp += 16)
123 {
124 uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp));
125 uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp);
126 uint8x8x4_t vrp = *vrpt;
127 uint32x2x4_t *temp_pointer;
128
129 vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
130 vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
131 vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]);
132 vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]);
133 vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
134 }
135
136 PNG_UNUSED(prev_row)
137 }
138
139 void
140 png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
141 png_const_bytep prev_row)
142 {
143 png_bytep rp = row;
144 png_const_bytep pp = prev_row;
145 png_bytep rp_stop = row + row_info->rowbytes;
146
147 uint8x16_t vtmp;
148 uint8x8x2_t *vrpt;
149 uint8x8x2_t vrp;
150 uint8x8x4_t vdest;
151 vdest.val[3] = vdup_n_u8(0);
152
153 vtmp = vld1q_u8(rp);
154 vrpt = png_ptr(uint8x8x2_t,&vtmp);
155 vrp = *vrpt;
156
157 png_debug(1, "in png_read_filter_row_avg3_neon");
158
159 for (; rp < rp_stop; pp += 12)
160 {
161 uint8x8_t vtmp1, vtmp2, vtmp3;
162
163 uint8x8x2_t *vppt;
164 uint8x8x2_t vpp;
165
166 uint32x2_t *temp_pointer;
167
168 vtmp = vld1q_u8(pp);
169 vppt = png_ptr(uint8x8x2_t,&vtmp);
170 vpp = *vppt;
171
172 vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
173 vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
174 vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
175
176 vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
177 vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
178 vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
179 vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
180
181 vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
182 vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
183
184 vtmp = vld1q_u8(rp + 12);
185 vrpt = png_ptr(uint8x8x2_t,&vtmp);
186 vrp = *vrpt;
187
188 vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
189 vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);
190
191 vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
192
193 vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
194 vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);
195
196 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
197 rp += 3;
198 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
199 rp += 3;
200 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
201 rp += 3;
202 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
203 rp += 3;
204 }
205 }
206
207 void
208 png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
209 png_const_bytep prev_row)
210 {
211 png_bytep rp = row;
212 png_bytep rp_stop = row + row_info->rowbytes;
213 png_const_bytep pp = prev_row;
214
215 uint8x8x4_t vdest;
216 vdest.val[3] = vdup_n_u8(0);
217
218 png_debug(1, "in png_read_filter_row_avg4_neon");
219
220 for (; rp < rp_stop; rp += 16, pp += 16)
221 {
222 uint32x2x4_t vtmp;
223 uint8x8x4_t *vrpt, *vppt;
224 uint8x8x4_t vrp, vpp;
225 uint32x2x4_t *temp_pointer;
226
227 vtmp = vld4_u32(png_ptr(uint32_t,rp));
228 vrpt = png_ptr(uint8x8x4_t,&vtmp);
229 vrp = *vrpt;
230 vtmp = vld4_u32(png_ptrc(uint32_t,pp));
231 vppt = png_ptr(uint8x8x4_t,&vtmp);
232 vpp = *vppt;
233
234 vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
235 vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
236 vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
237 vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
238 vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]);
239 vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
240 vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
241 vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
242
243 vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
244 }
245 }
246
247 static uint8x8_t
248 paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
249 {
250 uint8x8_t d, e;
251 uint16x8_t p1, pa, pb, pc;
252
253 p1 = vaddl_u8(a, b); /* a + b */
254 pc = vaddl_u8(c, c); /* c * 2 */
255 pa = vabdl_u8(b, c); /* pa */
256 pb = vabdl_u8(a, c); /* pb */
257 pc = vabdq_u16(p1, pc); /* pc */
258
259 p1 = vcleq_u16(pa, pb); /* pa <= pb */
260 pa = vcleq_u16(pa, pc); /* pa <= pc */
261 pb = vcleq_u16(pb, pc); /* pb <= pc */
262
263 p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */
264
265 d = vmovn_u16(pb);
266 e = vmovn_u16(p1);
267
268 d = vbsl_u8(d, b, c);
269 e = vbsl_u8(e, a, d);
270
271 return e;
272 }
273
274 void
275 png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
276 png_const_bytep prev_row)
277 {
278 png_bytep rp = row;
279 png_const_bytep pp = prev_row;
280 png_bytep rp_stop = row + row_info->rowbytes;
281
282 uint8x16_t vtmp;
283 uint8x8x2_t *vrpt;
284 uint8x8x2_t vrp;
285 uint8x8_t vlast = vdup_n_u8(0);
286 uint8x8x4_t vdest;
287 vdest.val[3] = vdup_n_u8(0);
288
289 vtmp = vld1q_u8(rp);
290 vrpt = png_ptr(uint8x8x2_t,&vtmp);
291 vrp = *vrpt;
292
293 png_debug(1, "in png_read_filter_row_paeth3_neon");
294
295 for (; rp < rp_stop; pp += 12)
296 {
297 uint8x8x2_t *vppt;
298 uint8x8x2_t vpp;
299 uint8x8_t vtmp1, vtmp2, vtmp3;
300 uint32x2_t *temp_pointer;
301
302 vtmp = vld1q_u8(pp);
303 vppt = png_ptr(uint8x8x2_t,&vtmp);
304 vpp = *vppt;
305
306 vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
307 vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
308
309 vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
310 vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
311 vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
312 vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
313
314 vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
315 vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
316 vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
317 vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);
318
319 vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
320 vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
321
322 vtmp = vld1q_u8(rp + 12);
323 vrpt = png_ptr(uint8x8x2_t,&vtmp);
324 vrp = *vrpt;
325
326 vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3);
327 vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);
328
329 vlast = vtmp2;
330
331 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
332 rp += 3;
333 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
334 rp += 3;
335 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
336 rp += 3;
337 vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
338 rp += 3;
339 }
340 }
341
342 void
343 png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
344 png_const_bytep prev_row)
345 {
346 png_bytep rp = row;
347 png_bytep rp_stop = row + row_info->rowbytes;
348 png_const_bytep pp = prev_row;
349
350 uint8x8_t vlast = vdup_n_u8(0);
351 uint8x8x4_t vdest;
352 vdest.val[3] = vdup_n_u8(0);
353
354 png_debug(1, "in png_read_filter_row_paeth4_neon");
355
356 for (; rp < rp_stop; rp += 16, pp += 16)
357 {
358 uint32x2x4_t vtmp;
359 uint8x8x4_t *vrpt, *vppt;
360 uint8x8x4_t vrp, vpp;
361 uint32x2x4_t *temp_pointer;
362
363 vtmp = vld4_u32(png_ptr(uint32_t,rp));
364 vrpt = png_ptr(uint8x8x4_t,&vtmp);
365 vrp = *vrpt;
366 vtmp = vld4_u32(png_ptrc(uint32_t,pp));
367 vppt = png_ptr(uint8x8x4_t,&vtmp);
368 vpp = *vppt;
369
370 vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
371 vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
372 vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
373 vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
374 vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]);
375 vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
376 vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]);
377 vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
378
379 vlast = vpp.val[3];
380
381 vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
382 }
383 }
384
385 #endif /* PNG_ARM_NEON_OPT > 0 */
386 #endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */
387 #endif /* READ */
OLDNEW
« no previous file with comments | « third_party/libpng/arm/filter_neon.S ('k') | third_party/libpng/contrib/intel/INSTALL » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698