OLD | NEW |
(Empty) | |
| 1 |
| 2 /* filter_neon_intrinsics.c - NEON optimised filter functions |
| 3 * |
| 4 * Copyright (c) 2014,2016 Glenn Randers-Pehrson |
| 5 * Written by James Yu <james.yu at linaro.org>, October 2013. |
| 6 * Based on filter_neon.S, written by Mans Rullgard, 2011. |
| 7 * |
| 8 * Last changed in libpng 1.6.22 [May 26, 2016] |
| 9 * |
| 10 * This code is released under the libpng license. |
| 11 * For conditions of distribution and use, see the disclaimer |
| 12 * and license in png.h |
| 13 */ |
| 14 |
| 15 #include "../pngpriv.h" |
| 16 |
| 17 #ifdef PNG_READ_SUPPORTED |
| 18 |
| 19 /* This code requires -mfpu=neon on the command line: */ |
| 20 #if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */ |
| 21 |
| 22 #include <arm_neon.h> |
| 23 |
/* libpng row pointers are not necessarily aligned to any particular boundary,
 * however this code will only work with appropriate alignment. arm/arm_init.c
 * checks for this (and will not compile unless it is done). This code uses
 * variants of png_aligncast to avoid compiler warnings.
 */
#define png_ptr(type,pointer) png_aligncast(type *,pointer)
#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer)

/* The following relies on a variable 'temp_pointer' being declared with type
 * 'type'. This is written this way just to hide the GCC strict aliasing
 * warning; note that the code is safe because there never is an alias between
 * the input and output pointers.
 *
 * png_ldr(type,pointer) evaluates to the value pointed to by 'pointer' after
 * casting it to 'type *' via the scratch variable 'temp_pointer'.
 */
#define png_ldr(type,pointer)\
   (temp_pointer = png_ptr(type,pointer), *temp_pointer)
| 39 |
| 40 #if PNG_ARM_NEON_OPT > 0 |
| 41 |
void
png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   /* 'Up' filter: each output byte is the raw byte plus the byte directly
    * above it in the previous (already reconstructed) row, modulo 256.
    * 16 bytes are processed per iteration.  NOTE(review): a row whose
    * length is not a multiple of 16 is read and written slightly past
    * rowbytes -- assumes the row buffers provide that slack; confirm
    * against the caller in pngrutil.c.
    */
   png_bytep out = row;
   png_bytep out_stop = row + row_info->rowbytes;
   png_const_bytep prior = prev_row;

   png_debug(1, "in png_read_filter_row_up_neon");

   while (out < out_stop)
   {
      uint8x16_t vout = vld1q_u8(out);
      uint8x16_t vprior = vld1q_u8(prior);

      vst1q_u8(out, vaddq_u8(vout, vprior));

      out += 16;
      prior += 16;
   }
}
| 62 |
void
png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   /* 'Sub' filter, 3 bytes per pixel: out[x] = raw[x] + out[x-bpp], mod 256.
    * Four pixels (12 bytes) are reconstructed per iteration; each 16-byte
    * load and each 4-byte store can reach slightly past rowbytes.
    * NOTE(review): assumes the row buffer has that slack -- confirm against
    * the caller in pngrutil.c.
    */
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   /* Pre-load the first 16 raw bytes, viewed as two 8-byte halves. */
   uint8x16_t vtmp = vld1q_u8(rp);
   uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
   uint8x8x2_t vrp = *vrpt;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);  /* 'previous pixel' seed for pixel 0 */

   png_debug(1, "in png_read_filter_row_sub3_neon");

   for (; rp < rp_stop;)
   {
      uint8x8_t vtmp1, vtmp2;
      uint32x2_t *temp_pointer;  /* required by the png_ldr macro */

      /* vext extracts each successive 3-byte pixel from the 16 loaded
       * bytes; vdest.val[i] is the i-th reconstructed pixel of the group,
       * formed by adding the previous reconstructed pixel.
       */
      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
      vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);

      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
      vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[2], vtmp1);

      /* Load the next group's raw bytes BEFORE the overlapping stores
       * below can overwrite them.
       */
      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t, &vtmp);
      vrp = *vrpt;

      /* Each store writes 4 bytes but advances rp by only 3 (lane 0 of a
       * 32-bit view of vdest.val[i] holds the pixel plus one extra byte,
       * overwritten by the next store).
       */
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }

   PNG_UNUSED(prev_row)  /* the Sub filter does not use the previous row */
}
| 109 |
void
png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   /* 'Sub' filter, 4 bytes per pixel: out[x] = raw[x] + out[x-bpp], mod 256.
    * vld4_u32 de-interleaves four consecutive pixels into vraw.val[0..3]
    * (lane 0 of each); a prefix-sum chain over those lanes reconstructs
    * four pixels per iteration, written back with vst4_lane_u32.
    */
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);  /* 'previous pixel' seed for pixel 0 */

   png_debug(1, "in png_read_filter_row_sub4_neon");

   for (; rp < rp_stop; rp += 16)
   {
      uint32x2x4_t vraw = vld4_u32(png_ptr(uint32_t,rp));
      uint8x8x4_t *vbytesp = png_ptr(uint8x8x4_t,&vraw);
      uint8x8x4_t vbytes = *vbytesp;
      uint32x2x4_t *temp_pointer;  /* required by the png_ldr macro */

      /* Chain the four pixels: each adds the previously reconstructed
       * pixel; vdest.val[3] carries into the next iteration.
       */
      vdest.val[0] = vadd_u8(vdest.val[3], vbytes.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[0], vbytes.val[1]);
      vdest.val[2] = vadd_u8(vdest.val[1], vbytes.val[2]);
      vdest.val[3] = vadd_u8(vdest.val[2], vbytes.val[3]);

      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }

   PNG_UNUSED(prev_row)  /* the Sub filter does not use the previous row */
}
| 138 |
void
png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   /* 'Average' filter, 3 bytes per pixel:
    *    out[x] = raw[x] + floor((out[x-bpp] + prior[x]) / 2), mod 256.
    * vhadd_u8 produces the halved sum without intermediate overflow and
    * vadd_u8 then adds the raw byte.  Four pixels (12 bytes) are produced
    * per iteration; loads/stores can reach slightly past rowbytes.
    * NOTE(review): assumes the row buffers have that slack -- confirm
    * against the caller in pngrutil.c.
    */
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t *vrpt;
   uint8x8x2_t vrp;
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);  /* 'previous pixel' seed for pixel 0 */

   /* Pre-load the first 16 raw bytes of the current row. */
   vtmp = vld1q_u8(rp);
   vrpt = png_ptr(uint8x8x2_t,&vtmp);
   vrp = *vrpt;

   png_debug(1, "in png_read_filter_row_avg3_neon");

   for (; rp < rp_stop; pp += 12)
   {
      uint8x8_t vtmp1, vtmp2, vtmp3;

      uint8x8x2_t *vppt;
      uint8x8x2_t vpp;

      uint32x2_t *temp_pointer;  /* required by the png_ldr macro */

      /* 16 bytes of the previous (already reconstructed) row; vtmp is
       * shared with the current-row pre-load further down.
       */
      vtmp = vld1q_u8(pp);
      vppt = png_ptr(uint8x8x2_t,&vtmp);
      vpp = *vppt;

      /* vext extracts the 3-byte pixels at offsets 3, 6 and 9 from the
       * 16 loaded bytes of each row; vdest.val[i] is the i-th
       * reconstructed pixel of the group.
       */
      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);

      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);

      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);

      /* Pre-load the next group's raw bytes BEFORE the overlapping
       * stores below can overwrite them.
       */
      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t,&vtmp);
      vrp = *vrpt;

      vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);

      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);

      vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);

      /* 4-byte stores advancing 3 bytes each; the extra byte is
       * overwritten by the following store.
       */
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }
}
| 206 |
void
png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   /* 'Average' filter, 4 bytes per pixel:
    *    out[x] = raw[x] + floor((out[x-bpp] + prior[x]) / 2), mod 256.
    * vld4_u32 de-interleaves four consecutive pixels of each row into
    * val[0..3] (lane 0); vhadd_u8 halves the sum without overflow and
    * vadd_u8 adds the raw byte.  Four pixels per iteration.
    */
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);  /* 'previous pixel' seed for pixel 0 */

   png_debug(1, "in png_read_filter_row_avg4_neon");

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vraw;
      uint8x8x4_t *vcurp, *vpriorp;
      uint8x8x4_t vcur, vprior;
      uint32x2x4_t *temp_pointer;  /* required by the png_ldr macro */

      /* Four raw pixels of the current row, then four reconstructed
       * pixels of the previous row (vraw is reused as scratch).
       */
      vraw = vld4_u32(png_ptr(uint32_t,rp));
      vcurp = png_ptr(uint8x8x4_t,&vraw);
      vcur = *vcurp;
      vraw = vld4_u32(png_ptrc(uint32_t,pp));
      vpriorp = png_ptr(uint8x8x4_t,&vraw);
      vprior = *vpriorp;

      /* Chain the four pixels; vdest.val[3] carries into the next
       * iteration as the left neighbour.
       */
      vdest.val[0] = vhadd_u8(vdest.val[3], vprior.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vcur.val[0]);
      vdest.val[1] = vhadd_u8(vdest.val[0], vprior.val[1]);
      vdest.val[1] = vadd_u8(vdest.val[1], vcur.val[1]);
      vdest.val[2] = vhadd_u8(vdest.val[1], vprior.val[2]);
      vdest.val[2] = vadd_u8(vdest.val[2], vcur.val[2]);
      vdest.val[3] = vhadd_u8(vdest.val[2], vprior.val[3]);
      vdest.val[3] = vadd_u8(vdest.val[3], vcur.val[3]);

      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }
}
| 246 |
/* Branchless Paeth predictor for 8 lanes of bytes.
 *
 * With p = a + b - c (a = left, b = above, c = upper-left):
 *    pa = |p - a| = |b - c|
 *    pb = |p - b| = |a - c|
 *    pc = |p - c| = |a + b - 2*c|
 * Return a when pa <= pb && pa <= pc, else b when pb <= pc, else c.
 * All arithmetic is widened to 16 bits so the distances cannot wrap.
 */
static uint8x8_t
paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
{
   uint8x8_t pick_b, result;
   uint16x8_t sum_ab, dist_a, dist_b, dist_c, choose_a;

   sum_ab = vaddl_u8(a, b);            /* a + b, widened */
   dist_c = vaddl_u8(c, c);            /* 2*c, widened */
   dist_a = vabdl_u8(b, c);            /* pa = |b - c| */
   dist_b = vabdl_u8(a, c);            /* pb = |a - c| */
   dist_c = vabdq_u16(sum_ab, dist_c); /* pc = |a + b - 2*c| */

   choose_a = vcleq_u16(dist_a, dist_b); /* pa <= pb */
   dist_a = vcleq_u16(dist_a, dist_c);   /* pa <= pc */
   dist_b = vcleq_u16(dist_b, dist_c);   /* pb <= pc */

   choose_a = vandq_u16(choose_a, dist_a); /* pa <= pb && pa <= pc */

   /* Narrow the 16-bit lane masks back to 8-bit select masks. */
   pick_b = vmovn_u16(dist_b);
   result = vmovn_u16(choose_a);

   pick_b = vbsl_u8(pick_b, b, c); /* b if pb <= pc, else c */
   result = vbsl_u8(result, a, pick_b); /* a wins when choose_a set */

   return result;
}
| 273 |
void
png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   /* 'Paeth' filter, 3 bytes per pixel:
    *    out[x] = raw[x] + paeth(out[x-bpp], prior[x], prior[x-bpp]), mod 256.
    * paeth(a,b,c) takes a = left, b = above, c = upper-left.  Four pixels
    * (12 bytes) are produced per iteration; loads/stores can reach
    * slightly past rowbytes.  NOTE(review): assumes the row buffers have
    * that slack -- confirm against the caller in pngrutil.c.
    */
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t *vrpt;
   uint8x8x2_t vrp;
   uint8x8_t vlast = vdup_n_u8(0);  /* upper-left (c) carried across groups */
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);     /* left neighbour (a) seed for pixel 0 */

   /* Pre-load the first 16 raw bytes of the current row. */
   vtmp = vld1q_u8(rp);
   vrpt = png_ptr(uint8x8x2_t,&vtmp);
   vrp = *vrpt;

   png_debug(1, "in png_read_filter_row_paeth3_neon");

   for (; rp < rp_stop; pp += 12)
   {
      uint8x8x2_t *vppt;
      uint8x8x2_t vpp;
      uint8x8_t vtmp1, vtmp2, vtmp3;
      uint32x2_t *temp_pointer;  /* required by the png_ldr macro */

      /* 16 bytes of the previous (already reconstructed) row; vtmp is
       * shared with the current-row pre-load further down.
       */
      vtmp = vld1q_u8(pp);
      vppt = png_ptr(uint8x8x2_t,&vtmp);
      vpp = *vppt;

      /* Pixel 0: a = last pixel of previous group, b = prior[x],
       * c = prior pixel of previous group (vlast).
       */
      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);

      /* vext extracts the 3-byte pixels at offsets 3, 6 and 9 from the
       * 16 loaded bytes of each row.
       */
      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
      vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);

      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);

      /* Pre-load the next group's raw bytes BEFORE the overlapping
       * stores below can overwrite them.
       */
      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t,&vtmp);
      vrp = *vrpt;

      vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);

      /* Remember this group's last previous-row pixel as the upper-left
       * neighbour for the next group's pixel 0.
       */
      vlast = vtmp2;

      /* 4-byte stores advancing 3 bytes each; the extra byte is
       * overwritten by the following store.
       */
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }
}
| 341 |
void
png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   /* 'Paeth' filter, 4 bytes per pixel:
    *    out[x] = raw[x] + paeth(out[x-bpp], prior[x], prior[x-bpp]), mod 256.
    * paeth(a,b,c) takes a = left, b = above, c = upper-left.  vld4_u32
    * de-interleaves four consecutive pixels of each row into val[0..3]
    * (lane 0); four pixels are produced per iteration.
    */
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   uint8x8_t vlast = vdup_n_u8(0);  /* upper-left (c) carried across groups */
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);     /* left neighbour (a) seed for pixel 0 */

   png_debug(1, "in png_read_filter_row_paeth4_neon");

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vraw;
      uint8x8x4_t *vcurp, *vpriorp;
      uint8x8x4_t vcur, vprior;
      uint32x2x4_t *temp_pointer;  /* required by the png_ldr macro */

      /* Four raw pixels of the current row, then four reconstructed
       * pixels of the previous row (vraw is reused as scratch).
       */
      vraw = vld4_u32(png_ptr(uint32_t,rp));
      vcurp = png_ptr(uint8x8x4_t,&vraw);
      vcur = *vcurp;
      vraw = vld4_u32(png_ptrc(uint32_t,pp));
      vpriorp = png_ptr(uint8x8x4_t,&vraw);
      vprior = *vpriorp;

      /* Chain the four pixels; each uses the previous reconstructed
       * pixel as 'a' and the previous-row pixels as 'b' and 'c'.
       */
      vdest.val[0] = paeth(vdest.val[3], vprior.val[0], vlast);
      vdest.val[0] = vadd_u8(vdest.val[0], vcur.val[0]);
      vdest.val[1] = paeth(vdest.val[0], vprior.val[1], vprior.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[1], vcur.val[1]);
      vdest.val[2] = paeth(vdest.val[1], vprior.val[2], vprior.val[1]);
      vdest.val[2] = vadd_u8(vdest.val[2], vcur.val[2]);
      vdest.val[3] = paeth(vdest.val[2], vprior.val[3], vprior.val[2]);
      vdest.val[3] = vadd_u8(vdest.val[3], vcur.val[3]);

      /* Last previous-row pixel becomes next group's upper-left. */
      vlast = vprior.val[3];

      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }
}
| 384 |
| 385 #endif /* PNG_ARM_NEON_OPT > 0 */ |
| 386 #endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */ |
| 387 #endif /* READ */ |
OLD | NEW |