OLD | NEW |
(Empty) | |
| 1 diff --git a/Makefile.in b/Makefile.in |
| 2 index b9c41f0..8472c8a 100644 |
| 3 --- a/Makefile.in |
| 4 +++ b/Makefile.in |
| 5 @@ -99,7 +99,8 @@ am__objects_1 = libpng_la-png.lo libpng_la-pngset.lo \ |
| 6 libpng_la-pngread.lo libpng_la-pngrio.lo libpng_la-pngwio.lo \ |
| 7 libpng_la-pngwrite.lo libpng_la-pngrtran.lo \ |
| 8 libpng_la-pngwtran.lo libpng_la-pngmem.lo \ |
| 9 - libpng_la-pngerror.lo libpng_la-pngpread.lo |
| 10 + libpng_la-pngerror.lo libpng_la-pngpread.lo \ |
| 11 + libpng_la-png_read_filter_row_neon.lo |
| 12 am_libpng_la_OBJECTS = $(am__objects_1) |
| 13 libpng_la_OBJECTS = $(am_libpng_la_OBJECTS) |
| 14 libpng_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \ |
| 15 @@ -113,7 +114,7 @@ am_libpng12_la_OBJECTS = libpng12_la-png.lo libpng12_la-pngs
et.lo \ |
| 16 libpng12_la-pngwio.lo libpng12_la-pngwrite.lo \ |
| 17 libpng12_la-pngrtran.lo libpng12_la-pngwtran.lo \ |
| 18 libpng12_la-pngmem.lo libpng12_la-pngerror.lo \ |
| 19 - libpng12_la-pngpread.lo |
| 20 + libpng12_la-pngpread.lo libpng12_la-png_read_filter_row_neon.lo |
| 21 libpng12_la_OBJECTS = $(am_libpng12_la_OBJECTS) |
| 22 libpng12_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \ |
| 23 $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ |
| 24 @@ -136,9 +137,9 @@ LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS
) \ |
| 25 --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \ |
| 26 $(LDFLAGS) -o $@ |
| 27 SOURCES = $(libpng_la_SOURCES) $(libpng12_la_SOURCES) \ |
| 28 - $(pngtest_SOURCES) |
| 29 + $(pngtest_SOURCES) $(pngasm_SOURCES) |
| 30 DIST_SOURCES = $(libpng_la_SOURCES) $(libpng12_la_SOURCES) \ |
| 31 - $(pngtest_SOURCES) |
| 32 + $(pngtest_SOURCES) $(pngasm_SOURCES) |
| 33 man3dir = $(mandir)/man3 |
| 34 man5dir = $(mandir)/man5 |
| 35 NROFF = nroff |
| 36 @@ -307,6 +308,8 @@ dist_man_MANS = libpng.3 libpngpf.3 png.5 |
| 37 EXTRA_SCRIPTS = libpng-config libpng12-config |
| 38 bin_SCRIPTS = @binconfigs@ |
| 39 |
| 40 +pngasm_SOURCES = png_read_filter_row_neon.S |
| 41 + |
| 42 # rules to build libpng, only build the old library on request |
| 43 lib_LTLIBRARIES = libpng12.la @compatlib@ |
| 44 EXTRA_LTLIBRARIES = libpng.la |
| 45 @@ -363,7 +366,7 @@ all: config.h |
| 46 $(MAKE) $(AM_MAKEFLAGS) all-am |
| 47 |
| 48 .SUFFIXES: |
| 49 -.SUFFIXES: .c .lo .o .obj |
| 50 +.SUFFIXES: .c .S .lo .o .obj |
| 51 am--refresh: |
| 52 @: |
| 53 $(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__conf
igure_deps) |
| 54 @@ -537,6 +540,7 @@ distclean-compile: |
| 55 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pngwrite.Plo@am__quo
te@ |
| 56 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pngwtran.Plo@am__quo
te@ |
| 57 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pngwutil.Plo@am__quo
te@ |
| 58 +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-png_read_filter_row_
neon.Plo@am__quote@ |
| 59 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pngtest.Po@am__quote@ |
| 60 |
| 61 .c.o: |
| 62 @@ -553,6 +557,13 @@ distclean-compile: |
| 63 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp)
@AMDEPBACKSLASH@ |
| 64 @am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'` |
| 65 |
| 66 +.S.o: |
| 67 +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@
$< |
| 68 +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po |
| 69 +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDE
PBACKSLASH@ |
| 70 +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp)
@AMDEPBACKSLASH@ |
| 71 +@am__fastdepCC_FALSE@ $(COMPILE) -c $< |
| 72 + |
| 73 .c.lo: |
| 74 @am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $
@ $< |
| 75 @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo |
| 76 @@ -560,6 +571,14 @@ distclean-compile: |
| 77 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp)
@AMDEPBACKSLASH@ |
| 78 @am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $< |
| 79 |
| 80 +.S.lo: |
| 81 +@am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $
@ $< |
| 82 +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo |
| 83 +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=yes @AMD
EPBACKSLASH@ |
| 84 +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp)
@AMDEPBACKSLASH@ |
| 85 +@am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $< |
| 86 + |
| 87 + |
| 88 libpng_la-png.lo: png.c |
| 89 @am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS)
--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng_la_CPPFLAG
S) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpng_la-png.lo -MD -MP -MF $(DEPDIR)
/libpng_la-png.Tpo -c -o libpng_la-png.lo `test -f 'png.c' || echo '$(srcdir)/'`
png.c |
| 90 @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/libpng_la-png.Tpo $(DEPDIR)/libpng_l
a-png.Plo |
| 91 @@ -665,6 +684,16 @@ libpng_la-pngpread.lo: pngpread.c |
| 92 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp)
@AMDEPBACKSLASH@ |
| 93 @am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS)
--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng_la_CPPFLAG
S) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng_la-pngpread.lo `test -f 'pngp
read.c' || echo '$(srcdir)/'`pngpread.c |
| 94 |
| 95 + |
| 96 + |
| 97 +libpng_la-png_read_filter_row_neon.lo: png_read_filter_row_neon.S |
| 98 +@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS)
--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng_la_CPPFLAG
S) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpng_la-png_read_filter_row_neon.lo
-MD -MP -MF $(DEPDIR)/libpng_la-png_read_filter_row_neon.Tpo -c -o libpng_la-png
_read_filter_row_neon.lo `test -f 'png_read_filter_row_neon.S' || echo '$(srcdir
)/'`png_read_filter_row_neon.S |
| 99 +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/libpng_la-png_read_filter_row_neon.T
po $(DEPDIR)/libpng_la-png_read_filter_row_neon.Plo |
| 100 +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='png_read_filter_row_neon.S' obje
ct='libpng_la-png_read_filter_row_neon.lo' libtool=yes @AMDEPBACKSLASH@ |
| 101 +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp)
@AMDEPBACKSLASH@ |
| 102 +@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS)
--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng_la_CPPFLAG
S) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng_la-png_read_filter_row_neon.l
o `test -f 'png_read_filter_row_neon.S' || echo '$(srcdir)/'`png_read_filter_row
_neon.S |
| 103 + |
| 104 + |
| 105 libpng12_la-png.lo: png.c |
| 106 @am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS)
--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFL
AGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpng12_la-png.lo -MD -MP -MF $(DEP
DIR)/libpng12_la-png.Tpo -c -o libpng12_la-png.lo `test -f 'png.c' || echo '$(sr
cdir)/'`png.c |
| 107 @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/libpng12_la-png.Tpo $(DEPDIR)/libpng
12_la-png.Plo |
| 108 @@ -770,6 +799,15 @@ libpng12_la-pngpread.lo: pngpread.c |
| 109 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp)
@AMDEPBACKSLASH@ |
| 110 @am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS)
--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFL
AGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng12_la-pngpread.lo `test -f '
pngpread.c' || echo '$(srcdir)/'`pngpread.c |
| 111 |
| 112 + |
| 113 +libpng12_la-png_read_filter_row_neon.lo: png_read_filter_row_neon.S |
| 114 +@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS)
--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFL
AGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpng12_la-png_read_filter_row_neon
.lo -MD -MP -MF $(DEPDIR)/libpng12_la-png_read_filter_row_neon.Tpo -c -o libpng1
2_la-png_read_filter_row_neon.lo `test -f 'png_read_filter_row_neon.S' || echo '
$(srcdir)/'`png_read_filter_row_neon.S |
| 115 +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/libpng12_la-png_read_filter_row_neon
.Tpo $(DEPDIR)/libpng12_la-png_read_filter_row_neon.Plo |
| 116 +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='png_read_filter_row_neon.S' object
='libpng12_la-png_read_filter_row_neon.lo' libtool=yes @AMDEPBACKSLASH@ |
| 117 +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp)
@AMDEPBACKSLASH@ |
| 118 +@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS)
--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFL
AGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng12_la-png_read_filter_row_ne
on.lo `test -f 'png_read_filter_row_neon.S' || echo '$(srcdir)/'`png_read_filter
_row_neon.S |
| 119 + |
| 120 + |
| 121 mostlyclean-libtool: |
| 122 -rm -f *.lo |
| 123 |
| 124 diff --git a/png_read_filter_row_neon.S b/png_read_filter_row_neon.S |
| 125 new file mode 100755 |
| 126 index 0000000..77ec7bd |
| 127 --- /dev/null |
| 128 +++ b/png_read_filter_row_neon.S |
| 129 @@ -0,0 +1,1172 @@ |
| 130 +#if defined(__ARM_NEON__) |
| 131 +#; Copyright (c) 2010, Code Aurora Forum. All rights reserved. |
| 132 +#; |
| 133 +#; Redistribution and use in source and binary forms, with or without |
| 134 +#; modification, are permitted provided that the following conditions are |
| 135 +#; met: |
| 136 +#; * Redistributions of source code must retain the above copyright |
| 137 +#; notice, this list of conditions and the following disclaimer. |
| 138 +#; * Redistributions in binary form must reproduce the above |
| 139 +#; copyright notice, this list of conditions and the following |
| 140 +#; disclaimer in the documentation and/or other materials provided |
| 141 +#; with the distribution. |
| 142 +#; * Neither the name of Code Aurora Forum, Inc. nor the names of its |
| 143 +#; contributors may be used to endorse or promote products derived |
| 144 +#; from this software without specific prior written permission. |
| 145 +#; |
| 146 +#; THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED |
| 147 +#; WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
| 148 +#; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT |
| 149 +#; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS |
| 150 +#; BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 151 +#; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 152 +#; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
| 153 +#; BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
| 154 +#; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE |
| 155 +#; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN |
| 156 +#; IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 157 + |
| 158 +#;=============================================================================
= |
| 159 + |
| 160 + .code 32 @; Code is ARM ISA |
| 161 +#;=============================================================================
= |
| 162 + |
| 163 + .global png_read_filter_row_neon |
| 164 + |
| 165 +#;=============================================================================
= |
| 166 +#; INPUTS: r0 rowbytes: number of bytes in current row |
| 167 +#; r1 pixel_depth: number of bits per pixel |
| 168 +#; r2 row: pointer to start of current row |
| 169 +#; r3 prev_row: pointer to start of previous row |
| 170 +#; [sp,#0] filter: filter type |
| 171 +#; |
| 172 +#; NOTE: Don't touch r5-r11 |
| 173 +#;=============================================================================
= |
| 174 +.balign 32 |
| 175 +.type png_read_filter_row_neon, %function |
| 176 +png_read_filter_row_neon: |
| 177 + |
| 178 + ldr r12,[sp,#0] |
| 179 + |
| 180 + cmp r12,#0 |
| 181 + beq DONE |
| 182 + |
| 183 + cmp r12,#1 |
| 184 + beq sub_filter |
| 185 + |
| 186 + cmp r12,#2 |
| 187 + beq up_filter |
| 188 + |
| 189 + cmp r12,#3 |
| 190 + beq avg_filter |
| 191 + |
| 192 + cmp r12,#4 |
| 193 + beq paeth_filter |
| 194 + |
| 195 + b DONE |
| 196 + |
| 197 + #;; --------------- |
| 198 + #;; SUB filter type |
| 199 + #;; --------------- |
| 200 + |
| 201 + |
| 202 +sub_filter: |
| 203 + |
| 204 + stmdb sp!, {r4} |
| 205 + |
| 206 + add r1,r1,#7 @; bpp = bytes per pixel |
| 207 + lsr r1,r1,#3 @; = (pixel_depth + 7) >> 3 |
| 208 + mov r12,r1 |
| 209 + |
| 210 + #;; r0 = rowbytes |
| 211 + #;; r1 = loop counter = bpp (initially) |
| 212 + #;; r2 = row pointer |
| 213 + #;; r12 = bpp = loop/pointer increment value |
| 214 + |
| 215 + cmp r1,r0 |
| 216 + beq sub_filter_exit @; exit if bpp == rowbytes |
| 217 + |
| 218 + cmp r12,#1 |
| 219 + beq sub_filter_1bpp |
| 220 + |
| 221 + cmp r12,#2 |
| 222 + beq sub_filter_2bpp |
| 223 + |
| 224 + cmp r12,#3 |
| 225 + beq sub_filter_3bpp |
| 226 + |
| 227 + cmp r12,#4 |
| 228 + beq sub_filter_4bpp |
| 229 + |
| 230 + cmp r12,#6 |
| 231 + beq sub_filter_6bpp |
| 232 + |
| 233 + cmp r12,#8 |
| 234 + beq sub_filter_8bpp |
| 235 + |
| 236 +sub_filter_exit: |
| 237 + b sub_filter_DONE @; return |
| 238 + |
| 239 + |
| 240 +sub_filter_1bpp: |
| 241 + |
| 242 + #;; ---------------------------- |
| 243 + #;; SUB filter, 1 byte per pixel |
| 244 + #;; ---------------------------- |
| 245 + |
| 246 + lsrs r4,r0,#4 @; r4 = floor(rowbytes/16) |
| 247 + @; = iteration count for loop
16 |
| 248 + beq sub_filter_1bpp_16bytes_done |
| 249 + |
| 250 + vmov.i8 d21, #0 |
| 251 + vld1.8 {d16,d17}, [r2] @; load 16 pixels |
| 252 + @; d16 = a b c d e f g h |
| 253 + @; d17 = i j k l m n o p |
| 254 + |
| 255 + mov r1, #0 |
| 256 +sub_filter_1bpp_16bytes: |
| 257 + |
| 258 + |
| 259 + |
| 260 + |
| 261 + vshl.i64 d18, d16, #8 @; d18 = 0 a b c d e f g |
| 262 + vadd.i8 d18, d16, d18 @; d18 = a a+b b+c c+d d+e e+f f
+g g+h |
| 263 + |
| 264 + vshl.i64 d18, d18, #8 @; d18 = 0 a a+b b+c c+d d+e e+f
f+g |
| 265 + vadd.i8 d18, d16, d18 @; d18 = a a+b a+b+c b+c+d c+d+e
d+e+f e+f+g f+g+h |
| 266 + |
| 267 + vshl.i64 d18, d18, #8 @; shift add continuously to pro
pagate the sum of previous |
| 268 + vadd.i8 d18, d16, d18 @; and current pixels |
| 269 + |
| 270 + vshl.i64 d18, d18, #8 |
| 271 + vadd.i8 d18, d16, d18 |
| 272 + |
| 273 + vshl.i64 d18, d18, #8 |
| 274 + vadd.i8 d18, d16, d18 |
| 275 + |
| 276 + vshl.i64 d18, d18, #8 |
| 277 + vadd.i8 d18, d16, d18 |
| 278 + |
| 279 + vshl.i64 d18, d18, #8 |
| 280 + vadd.i8 d18, d16, d18 @; maximum data size for shift i
s 64 bits i.e. doubleword. |
| 281 + @; after computing the value of
all the pixels in the double word |
| 282 + @; extract the last computed val
ue which will be used by |
| 283 + @; the next set of pixels (i.e n
ext doubleword) |
| 284 + vext.8 d22, d18, d21, #7 @; extract the updated value of
d18[7] i.e a+b+c+d+e+f+g+h |
| 285 + vadd.i8 d17, d17, d22 @; d17 = a+b+c+d+e+f+g+h+i j k l
m n o p |
| 286 + |
| 287 + vshl.i64 d19, d17, #8 @; continue shift-add as the fir
st half |
| 288 + vadd.i8 d19, d17, d19 |
| 289 + |
| 290 + vshl.i64 d19, d19, #8 |
| 291 + vadd.i8 d19, d17, d19 |
| 292 + |
| 293 + vshl.i64 d19, d19, #8 |
| 294 + vadd.i8 d19, d17, d19 |
| 295 + |
| 296 + vshl.i64 d19, d19, #8 |
| 297 + vadd.i8 d19, d17, d19 |
| 298 + |
| 299 + vshl.i64 d19, d19, #8 |
| 300 + vadd.i8 d19, d17, d19 |
| 301 + |
| 302 + vshl.i64 d19, d19, #8 |
| 303 + vadd.i8 d19, d17, d19 |
| 304 + |
| 305 + vshl.i64 d19, d19, #8 |
| 306 + vadd.i8 d19, d17, d19 |
| 307 + |
| 308 + vst1.8 {d18,d19},[r2]! @; store the result back |
| 309 + |
| 310 + add r1, r1, #16 @; add 16 to the loop counter(n
o of bytes completed) |
| 311 + subs r4,r4,#1 @; decrement iteration count |
| 312 + beq sub_filter_1bpp_16bytes_adjust |
| 313 + |
| 314 + |
| 315 + vext.8 d22, d19, d21, #7 @; more iterations to go |
| 316 + @; extract the last computed va
lue |
| 317 + vld1.8 {d16,d17}, [r2] @; load the next 16 bytes |
| 318 + vadd.i8 d16, d16, d22 @; set up the input by adding t
he previous pixel |
| 319 + @; value to the input |
| 320 + b sub_filter_1bpp_16bytes |
| 321 + |
| 322 +sub_filter_1bpp_16bytes_adjust: |
| 323 + |
| 324 + cmp r1, r0 @; no more pixels left .. exit |
| 325 + sub r2, r2, #1 @; more pixels remaining |
| 326 + @; r2 points to the current pix
el adjust it |
| 327 + @; so that it points to the pre
v pixel for the below loop |
| 328 + beq sub_filter_DONE |
| 329 + |
| 330 +sub_filter_1bpp_16bytes_done: |
| 331 + |
| 332 + |
| 333 + vld1.8 {d0[0]},[r2]! @; load 1 byte (1 pixel) into D
0[0] |
| 334 + @; increment row pointer |
| 335 +sub_filter_1bpp_loop: |
| 336 + add r1,r1,r12 @; loop counter += bpp |
| 337 + cmp r1,r0 @; |
| 338 + |
| 339 + vld1.8 {d2[0]},[r2] @; load 1 byte (current pixel)
into D2[0] |
| 340 + |
| 341 + vadd.i8 d0,d0,d2 @; vector add 1 byte of previou
s pixel with |
| 342 + @; 1 byte of current
pixel |
| 343 + vst1.8 {d0[0]},[r2]! @; store 1 byte (updated pixel)
back |
| 344 + @; into row pointer location a
nd increment |
| 345 + @; row pointer |
| 346 + |
| 347 + bne sub_filter_1bpp_loop @; loop back until loop counter
== rowbytes |
| 348 + |
| 349 + b sub_filter_DONE @; return |
| 350 + |
| 351 + #;; ----------------------------- |
| 352 + #;; SUB filter, 2 bytes per pixel |
| 353 + #;; ----------------------------- |
| 354 +sub_filter_2bpp: |
| 355 + |
| 356 + lsrs r4,r0,#4 @; r4 = floor(rowbytes/16) |
| 357 + @; = iteration count for loo
p16 |
| 358 + beq sub_filter_2bpp_16bytes_done |
| 359 + |
| 360 + vmov.i8 d21, #0 |
| 361 + vld1.8 {d16,d17}, [r2] @; load 16 bytes to q8 |
| 362 + @; d16 = a b c d e f g h |
| 363 + @; d17 = i j k l m n o p |
| 364 + mov r1, #0 |
| 365 +sub_filter_2bpp_16bytes: |
| 366 + |
| 367 + vshl.i64 d18, d16, #16 @; each pixel is 2bytes .. shi
ft by 16 bits to get previous pixel |
| 368 + vadd.i8 d18, d16, d18 @; add to the current pixel |
| 369 + |
| 370 + vshl.i64 d18, d18, #16 @; shift-add to propagate the c
omputed sum as the case for 1bpp |
| 371 + vadd.i8 d18, d16, d18 |
| 372 + |
| 373 + vshl.i64 d18, d18, #16 |
| 374 + vadd.i8 d18, d16, d18 |
| 375 + |
| 376 + |
| 377 + vext.8 d22, d18, d21, #6 @; extract the last computed va
lue (i.e. last 2 bytes) |
| 378 + vadd.i8 d17, d17, d22 @; add the last computed pixel
to the input |
| 379 + |
| 380 + vshl.i64 d19, d17, #16 |
| 381 + vadd.i8 d19, d17, d19 |
| 382 + |
| 383 + vshl.i64 d19, d19, #16 |
| 384 + vadd.i8 d19, d17, d19 |
| 385 + |
| 386 + vshl.i64 d19, d19, #16 |
| 387 + vadd.i8 d19, d17, d19 |
| 388 + |
| 389 + |
| 390 + vst1.8 {d18,d19},[r2]! @; store the result back |
| 391 + |
| 392 + |
| 393 + add r1, r1, #16 @; add 16 to the loop counter(n
o of bytes completed) |
| 394 + subs r4,r4,#1 @; decrement iteration count |
| 395 + beq sub_filter_2bpp_16bytes_adjust |
| 396 + |
| 397 + |
| 398 + vext.8 d22, d19, d21, #6 @; extract the last computed va
lue |
| 399 + @; add the last computed pixel
to the input |
| 400 + vld1.8 {d16,d17}, [r2] |
| 401 + vadd.i8 d16, d16, d22 |
| 402 + |
| 403 + b sub_filter_2bpp_16bytes |
| 404 + |
| 405 + |
| 406 +sub_filter_2bpp_16bytes_adjust: |
| 407 + |
| 408 + cmp r1, r0 @; no more pixels left .. exit |
| 409 + sub r2, r2, #2 @; more pixels remaining |
| 410 + @; r2 points to the current pix
el adjust it |
| 411 + @; so that it points to the pre
v pixel for the below loop |
| 412 + beq sub_filter_DONE |
| 413 + |
| 414 +sub_filter_2bpp_16bytes_done: |
| 415 + |
| 416 + vld1.16 {d0[0]},[r2]! @; load 2 bytes (1 pixel) into
D0[0] |
| 417 + @; increment row pointer |
| 418 +sub_filter_2bpp_loop: |
| 419 + add r1,r1,r12 @; loop counter += bpp |
| 420 + cmp r1,r0 @; |
| 421 + |
| 422 + vld1.16 {d2[0]},[r2] @; load 2 bytes (current pixel)
into D2[0] |
| 423 + vadd.i8 d0,d0,d2 @; vector add 2 bytes of previo
us pixel with |
| 424 + @; 2 bytes of curren
t pixel |
| 425 + vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel
) back |
| 426 + @; into row pointer location a
nd increment |
| 427 + @; row pointer |
| 428 + |
| 429 + bne sub_filter_2bpp_loop @; loop back until loop counter
== rowbytes |
| 430 + @ |
| 431 + b sub_filter_DONE @ ; return |
| 432 + |
| 433 + #;; ----------------------------- |
| 434 + #;; SUB filter, 3 bytes per pixel |
| 435 + #;; ----------------------------- |
| 436 +sub_filter_3bpp: |
| 437 + vld1.32 {d0[0]},[r2], r12 @; load 4 bytes (1 pixel + 1 ex
tra byte) into D0[0] |
| 438 + @; increment row pointer by bpp |
| 439 +sub_filter_3bpp_loop: |
| 440 + add r1,r1,r12 @; loop counter += bpp |
| 441 + cmp r1,r0 @; |
| 442 + |
| 443 + vld1.32 {d2[0]},[r2] @; load 4 bytes (current pixel
+ 1 extra byte) into D2[0] |
| 444 + vadd.i8 d0,d0,d2 @; vector add 3 bytes of previo
us pixel with |
| 445 + @; 3 bytes of curren
t pixel |
| 446 + vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel
) back |
| 447 + @; into row pointer location a
nd increment |
| 448 + @; row pointer |
| 449 + vst1.8 {d0[2]},[r2]! @; store 1 byte (updated pixel)
back |
| 450 + @; into row pointer location a
nd increment |
| 451 + @; row pointer |
| 452 + |
| 453 + bne sub_filter_3bpp_loop @; loop back until loop counter
== rowbytes |
| 454 + |
| 455 + b sub_filter_DONE @; return |
| 456 + |
| 457 + #;; ----------------------------- |
| 458 + #;; SUB filter, 4 bytes per pixel |
| 459 + #;; ----------------------------- |
| 460 +sub_filter_4bpp: |
| 461 + vld1.32 {d0[0]},[r2]! @; load 4 bytes (1 pixel) into
D0[0] |
| 462 + @; increment row pointer |
| 463 +sub_filter_4bpp_loop: @ |
| 464 + add r1,r1,r12 @; loop counter += bpp |
| 465 + cmp r1,r0 @; |
| 466 + |
| 467 + |
| 468 + vld1.32 {d2[0]},[r2] @; load 4 bytes (current pixel)
into D2[0] |
| 469 + vadd.i8 d0,d0,d2 @; vector add 4 bytes of previo
us pixel with |
| 470 + @; 4 bytes of curren
t pixel |
| 471 + vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel
) back |
| 472 + @; into row pointer location a
nd increment |
| 473 + @; row pointer |
| 474 + |
| 475 + bne sub_filter_4bpp_loop @; loop back until loop counter
== rowbytes |
| 476 + |
| 477 + b sub_filter_DONE @; return |
| 478 + |
| 479 + #;; ----------------------------- |
| 480 + #;; SUB filter, 6 bytes per pixel |
| 481 + #;; ----------------------------- |
| 482 +sub_filter_6bpp: |
| 483 + vld1.8 {d0},[r2],r12 @; load 8 bytes (1 pixel + 2 ext
ra bytes) into D0 |
| 484 + @; increment row pointer by bpp |
| 485 +sub_filter_6bpp_loop: @ |
| 486 + add r1,r1,r12 @; loop counter += bpp |
| 487 + cmp r1,r0 @; |
| 488 + |
| 489 + vld1.8 {d2},[r2] @; load 8 bytes (1 pixel + 2 ext
ra bytes) into D2 |
| 490 + vadd.i8 d0,d0,d2 @; vector add 6 bytes of previou
s pixel with |
| 491 + @; 6 bytes of current
pixel |
| 492 + vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel)
back |
| 493 + @; into row pointer location an
d increment |
| 494 + @; row pointer |
| 495 + vst1.16 {d0[2]},[r2]! @; store 2 bytes (updated pixel)
back |
| 496 + @; into row pointer location an
d increment |
| 497 + @; row pointer |
| 498 + |
| 499 + bne sub_filter_6bpp_loop @; loop back until loop counter
== rowbytes |
| 500 + |
| 501 + b sub_filter_DONE @; return |
| 502 + |
| 503 + #;; ----------------------------- |
| 504 + #;; SUB filter, 8 bytes per pixel |
| 505 + #;; ----------------------------- |
| 506 +sub_filter_8bpp: |
| 507 + vld1.8 {d0},[r2]! @; load 8 bytes (1 pixel) into D
0 |
| 508 + @; increment row pointer |
| 509 +sub_filter_8bpp_loop: @ |
| 510 + add r1,r1,r12 @; loop counter += bpp |
| 511 + cmp r1,r0 @; |
| 512 + vld1.8 {d2},[r2] @; load 8 bytes (current pixel)
into D2 |
| 513 + vadd.i8 d0,d0,d2 @; vector add 8 bytes of previou
s pixel with |
| 514 + @; 8 bytes of current
pixel |
| 515 + vst1.8 {d0},[r2]! @; store 8 bytes (updated pixel)
back |
| 516 + @; into row pointer location an
d increment |
| 517 + @; row pointer |
| 518 + |
| 519 + |
| 520 + bne sub_filter_8bpp_loop @; loop back until loop counter
== rowbytes |
| 521 + @ |
| 522 + b sub_filter_DONE @ ; return |
| 523 + |
| 524 +sub_filter_DONE: |
| 525 + |
| 526 + ldmia sp!, {r4} |
| 527 + bx r14 |
| 528 + |
| 529 + #;; -------------- |
| 530 + #;; UP filter type |
| 531 + #;; -------------- |
| 532 +up_filter: |
| 533 + |
| 534 + #;; r0 = rowbytes |
| 535 + #;; r1 = pixel_depth (not required for UP filter type) |
| 536 + #;; r2 = row pointer |
| 537 + #;; r3 = previous row pointer |
| 538 + |
| 539 + |
| 540 + lsrs r1,r0,#5 @; r1 = floor(rowbytes/32) |
| 541 + @; = iteration count for loop
32 |
| 542 + beq up_filter_32bytes_proc_done |
| 543 + |
| 544 + |
| 545 +up_filter_32bytes_proc: |
| 546 + |
| 547 + |
| 548 + mov r12, r2 |
| 549 + |
| 550 + vld1.8 {q0},[r3]! @; load 32 bytes from previous |
| 551 + vld1.8 {q2},[r3]! @; row and increment pointer |
| 552 + @ |
| 553 + @ |
| 554 + vld1.8 {q1},[r12]! @; load 32 bytes from current ro
w |
| 555 + vld1.8 {q3},[r12]! @ |
| 556 + @ |
| 557 + @ |
| 558 + @ |
| 559 + vadd.i8 q0,q0,q1 @; vector add of 16 bytes |
| 560 + vadd.i8 q2,q2,q3 @ |
| 561 + @ |
| 562 + @ |
| 563 + @ |
| 564 + vst1.8 {q0},[r2]! @; store 32 bytes to current row |
| 565 + vst1.8 {q2},[r2]! @ |
| 566 + @; and increment pointer |
| 567 + sub r0,r0,#32 @; subtract 32 from rowbytes |
| 568 + subs r1,r1,#1 @; decrement iteration count |
| 569 + bne up_filter_32bytes_proc |
| 570 + |
| 571 + |
| 572 + |
| 573 +up_filter_32bytes_proc_done: |
| 574 + |
| 575 + lsrs r1,r0,#4 @; r1 = floor(rowbytes/16) |
| 576 + @; = iteration count for loop
16 |
| 577 + beq up_filter_16bytes_proc_done |
| 578 + |
| 579 +up_filter_16bytes_proc: |
| 580 + |
| 581 + vld1.8 {q0},[r3]! @; load 16 bytes from previous |
| 582 + @; row and increment pointer |
| 583 + vld1.8 {q1},[r2] @; load 16 bytes from current ro
w |
| 584 + vadd.i8 q0,q0,q1 @; vector add of 16 bytes |
| 585 + vst1.8 {q0},[r2]! @; store 16 bytes to current row |
| 586 + @; and increment pointer |
| 587 + sub r0,r0,#16 @; subtract 16 from rowbytes |
| 588 + subs r1,r1,#1 @; decrement iteration count |
| 589 + bne up_filter_16bytes_proc |
| 590 + |
| 591 +up_filter_16bytes_proc_done: |
| 592 + |
| 593 + lsrs r1,r0,#3 @; r1 = floor(rowbytes/8) |
| 594 + beq up_filter_8bytes_proc_done |
| 595 + |
| 596 +up_filter_8bytes_proc: |
| 597 + |
| 598 + vld1.8 {d0},[r3]! @; load 8 bytes from previous |
| 599 + @; row and increment pointer |
| 600 + vld1.8 {d2},[r2] @; load 8 bytes from current row |
| 601 + vadd.i8 d0,d0,d2 @; vector add 8 bytes |
| 602 + vst1.8 {d0},[r2]! @; store 8 bytes to current row |
| 603 + @; and increment pointer |
| 604 + sub r0,r0,#8 @; subtract 8 from rowbytes |
| 605 + |
| 606 +up_filter_8bytes_proc_done: |
| 607 + |
| 608 + lsrs r1,r0,#2 @; r1 = floor(rowbytes/4) |
| 609 + beq up_filter_4bytes_proc_done |
| 610 + |
| 611 +up_filter_4bytes_proc: |
| 612 + |
| 613 + vld1.32 {d0[0]},[r3]! @; load 4 bytes from previous ro
w |
| 614 + @; and increment pointer |
| 615 + vld1.32 {d2[0]},[r2] @; load 4 bytes from current row |
| 616 + vadd.i8 d0,d0,d2 @; vector add 4 bytes |
| 617 + vst1.32 {d0[0]},[r2]! @; store 4 bytes to current row |
| 618 + @; and increment pointer |
| 619 + sub r0,r0,#4 @; subtract 4 from rowbytes |
| 620 + |
| 621 +up_filter_4bytes_proc_done: |
| 622 + |
| 623 + lsrs r1,r0,#1 @; r1 = floor(rowbytes/2) |
| 624 + beq up_filter_2bytes_proc_done |
| 625 + |
| 626 +up_filter_2bytes_proc: |
| 627 + |
| 628 + vld1.16 {d0[0]},[r3]! @; load 2 bytes from previous ro
w |
| 629 + @; and increment pointer |
| 630 + vld1.16 {d2[0]},[r2] @; load 2 bytes from current row |
| 631 + vadd.i8 d0,d0,d2 @; vector add 2 bytes |
| 632 + vst1.16 {d0[0]},[r2]! @; store 2 bytes to current row |
| 633 + @; and increment pointer |
| 634 + sub r0,r0,#2 @; subtract 2 from rowbytes |
| 635 + |
| 636 +up_filter_2bytes_proc_done: |
| 637 + |
| 638 + cmp r0,#0 |
| 639 + beq up_filter_1byte_proc_done |
| 640 + |
| 641 +up_filter_1byte_proc: |
| 642 + |
| 643 + vld1.8 {d0[0]},[r3]! @; load 1 byte from previous row |
| 644 + @; and increment pointer |
| 645 + vld1.8 {d2[0]},[r2] @; load 1 byte from current row |
| 646 + vadd.i8 d0,d0,d2 @; vector add 1 byte |
| 647 + vst1.8 {d0[0]},[r2]! @; store 1 byte to current row |
| 648 + @; and increment pointer |
| 649 +up_filter_1byte_proc_done: |
| 650 + |
| 651 + b DONE |
| 652 + |
| 653 + #;; --------------- |
| 654 + #;; AVG filter type |
| 655 + #;; --------------- |
| 656 +avg_filter: |
| 657 + |
| 658 + add r1,r1,#7 @; bpp = bytes per pixel |
| 659 + lsr r1,r1,#3 @; = (pixel_depth + 7) >> 3 |
| 660 + mov r12,r1 |
| 661 + |
| 662 + #;; r0 = rowbytes |
| 663 + #;; r1 = loop counter = bpp (initially) |
| 664 + #;; r2 = row pointer |
| 665 + #;; r3 = previous row pointer |
| 666 + #;; r12 = bpp = loop/pointer increment value |
| 667 + |
| 668 + cmp r12,#1 |
| 669 + beq avg_filter_1bpp |
| 670 + |
| 671 + cmp r12,#2 |
| 672 + beq avg_filter_2bpp |
| 673 + |
| 674 + cmp r12,#3 |
| 675 + beq avg_filter_3bpp |
| 676 + |
| 677 + cmp r12,#4 |
| 678 + beq avg_filter_4bpp |
| 679 + |
| 680 + cmp r12,#6 |
| 681 + beq avg_filter_6bpp |
| 682 + |
| 683 + cmp r12,#8 |
| 684 + beq avg_filter_8bpp |
| 685 + |
| 686 +avg_filter_exit: |
| 687 + b DONE @; return |
| 688 + |
| 689 + #;; ---------------------------- |
| 690 + #;; AVG filter, 1 byte per pixel |
| 691 + #;; ---------------------------- |
| 692 +avg_filter_1bpp: |
| 693 + |
| 694 + cmp r1,r0 |
| 695 + |
| 696 + vld1.8 {d0[0]},[r2] @; load 1 byte (pixel x) from c
urr |
| 697 + @; row into d0[0] |
| 698 + vld1.8 {d1[0]},[r3]! @; load 1 byte (pixel b) from p
rev |
| 699 + @; row into d1[0] |
| 700 + @; increment prev row pointer |
| 701 + vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and
add |
| 702 + @; to pixel x |
| 703 + vst1.8 {d0[0]},[r2]! @; store 1 byte (updated pixel
x) |
| 704 + @; increment curr row pointer |
| 705 + @; updated pixel x is now pixel
a |
| 706 + beq DONE |
| 707 + |
| 708 +avg_filter_1bpp_loop: |
| 709 + add r1,r1,r12 @; loop counter += bpp |
| 710 + cmp r1,r0 |
| 711 + |
| 712 + |
| 713 + vld1.8 {d2[0]},[r2] @; load 1 byte (pixel x) from c
urr |
| 714 + @; row into d2[0] |
| 715 + vld1.8 {d1[0]},[r3]! @; load 1 byte (pixel b) from p
rev |
| 716 + @; row into d1[0] |
| 717 + vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b) |
| 718 + vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2 |
| 719 + vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2) |
| 720 + vst1.8 {d0[0]},[r2]! @; store 1 byte (updated pixel
x) |
| 721 + @; increment curr row pointer |
| 722 + bne avg_filter_1bpp_loop |
| 723 + |
| 724 + b DONE @; exit loop when |
| 725 + @; loop counter == rowbytes |
| 726 + #;; ----------------------------- |
| 727 + #;; AVG filter, 2 bytes per pixel |
| 728 + #;; ----------------------------- |
| 729 +avg_filter_2bpp: |
| 730 + |
| 731 + cmp r1,r0 |
| 732 + |
| 733 + vld1.16 {d0[0]},[r2] @; load 2 bytes (pixel x) from
curr |
| 734 + @; row into d0[0] |
| 735 + vld1.16 {d1[0]},[r3]! @; load 2 bytes (pixel b) from
prev |
| 736 + @; row into d1[0] |
| 737 + @; increment prev row pointer |
| 738 + vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and
add |
| 739 + @; to pixel x |
| 740 + vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel
x) |
| 741 + @; increment curr row pointer |
| 742 + @; updated pixel x is now pixel
a |
| 743 + beq DONE |
| 744 + |
| 745 +avg_filter_2bpp_loop: |
| 746 + add r1,r1,r12 @; loop counter += bpp |
| 747 + cmp r1,r0 |
| 748 + |
| 749 + |
| 750 + vld1.16 {d2[0]},[r2] @; load 2 bytes (pixel x) from
curr |
| 751 + @; row into d2[0] |
| 752 + vld1.16 {d1[0]},[r3]! @; load 2 bytes (pixel b) from
prev |
| 753 + @; row into d1[0] |
| 754 + vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b) |
| 755 + vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2 |
| 756 + vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2) |
| 757 + vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel
x) |
| 758 + @; increment curr row pointer |
| 759 + |
| 760 + bne avg_filter_2bpp_loop |
| 761 + |
| 762 + b DONE @; exit loop when |
| 763 + @; loop counter == rowbytes |
| 764 + |
| 765 + #;; ----------------------------- |
| 766 + #;; AVG filter, 3 bytes per pixel |
| 767 + #;; ----------------------------- |
| 768 +avg_filter_3bpp: |
| 769 + @; pixel handled before the loop: only (b >> 1) is added to x |
| 770 + cmp r1,r0 @; loop counter == rowbytes ? |
| 771 + @; NOTE(review): each 4-byte load reads 1 byte past the pixel - confirm row padding |
| 772 + vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x + 1 extra |
| 773 + @; byte) from curr row into d0[0] |
| 774 + vld1.32 {d1[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra |
| 775 + @; byte) from prev row into d1[0] |
| 776 + @; advance prev row pointer by r12 |
| 777 + vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add |
| 778 + @; to pixel x |
| 779 + vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x) |
| 780 + @; increment curr row pointer |
| 781 + vst1.8 {d0[2]},[r2]! @; store 1 byte (updated pixel x) |
| 782 + @; increment curr row pointer |
| 783 + @; updated pixel x is now pixel a |
| 784 + beq DONE @; flags still come from the cmp above |
| 785 + @; remaining pixels: x += (a + b)/2, with a carried over in d0 |
| 786 +avg_filter_3bpp_loop: |
| 787 + add r1,r1,r12 @; loop counter += bpp |
| 788 + cmp r1,r0 |
| 789 + @; the flags from this cmp are tested by the bne at the loop end |
| 790 + vld1.32 {d2[0]},[r2] @; load 4 bytes (pixel x + 1 extra |
| 791 + @; byte) from curr row into d2[0] |
| 792 + vld1.32 {d1[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra |
| 793 + @; byte) from prev row into d1[0] |
| 794 + vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b) |
| 795 + vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2 |
| 796 + vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2) |
| 797 + vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x) |
| 798 + @; increment curr row pointer |
| 799 + vst1.8 {d0[2]},[r2]! @; store 1 byte (updated pixel x) |
| 800 + @; increment curr row pointer |
| 801 + @; only 3 bytes are stored; the extra 4th byte is never written back |
| 802 + bne avg_filter_3bpp_loop |
| 803 + |
| 804 + b DONE @; exit loop when |
| 805 + @; loop counter == rowbytes |
| 806 + #;; ----------------------------- |
| 807 + #;; AVG filter, 4 bytes per pixel |
| 808 + #;; ----------------------------- |
| 809 +avg_filter_4bpp: |
| 810 + @; pixel handled before the loop: only (b >> 1) is added to x |
| 811 + cmp r1,r0 @; loop counter == rowbytes ? |
| 812 + @; the flags from this cmp are tested by the beq DONE below |
| 813 + vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x) from curr |
| 814 + @; row into d0[0] |
| 815 + vld1.32 {d1[0]},[r3]! @; load 4 bytes (pixel b) from prev |
| 816 + @; row into d1[0] |
| 817 + @; increment prev row pointer |
| 818 + vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add |
| 819 + @; to pixel x |
| 820 + vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x) |
| 821 + @; increment curr row pointer |
| 822 + @; updated pixel x is now pixel a |
| 823 + beq DONE @; nothing left after this pixel |
| 824 + @; remaining pixels: x += (a + b)/2, with a carried over in d0 |
| 825 +avg_filter_4bpp_loop: |
| 826 + add r1,r1,r12 @; loop counter += bpp |
| 827 + cmp r1,r0 |
| 828 + @; the flags from this cmp are tested by the bne at the loop end |
| 829 + |
| 830 + vld1.32 {d2[0]},[r2] @; load 4 bytes (pixel x) from curr |
| 831 + @; row into d2[0] |
| 832 + vld1.32 {d1[0]},[r3]! @; load 4 bytes (pixel b) from prev |
| 833 + @; row into d1[0], increment prev row pointer |
| 834 + vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b), widened to 16 bits |
| 835 + vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2 |
| 836 + vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2) |
| 837 + vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x) |
| 838 + @; increment curr row pointer |
| 839 + bne avg_filter_4bpp_loop |
| 840 + |
| 841 + b DONE @; exit loop when |
| 842 + @; loop counter == rowbytes |
| 843 + #;; ----------------------------- |
| 844 + #;; AVG filter, 6 bytes per pixel |
| 845 + #;; ----------------------------- |
| 846 +avg_filter_6bpp: |
| 847 + @; pixel handled before the loop: only (b >> 1) is added to x |
| 848 + cmp r1,r0 @; loop counter == rowbytes ? |
| 849 + @; NOTE(review): each 8-byte load reads 2 bytes past the pixel - confirm row padding |
| 850 + vld1.8 {d0},[r2] @; load 8 bytes (pixel x + 2 extra |
| 851 + @; bytes) from curr row into d0 |
| 852 + vld1.8 {d1},[r3],r12 @; load 8 bytes (pixel b + 2 extra |
| 853 + @; bytes) from prev row into d1 |
| 854 + @; advance prev row pointer by r12 |
| 855 + vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add |
| 856 + @; to pixel x |
| 857 + vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x) |
| 858 + @; increment curr row pointer |
| 859 + @; (first 4 of the 6 pixel bytes) |
| 860 + vst1.16 {d0[2]},[r2]! @; store 2 bytes (updated pixel x) |
| 861 + @; increment curr row pointer |
| 862 + @; updated pixel x is now pixel a |
| 863 + beq DONE @; flags still come from the cmp above |
| 864 + @; remaining pixels: x += (a + b)/2, with a carried over in d0 |
| 865 +avg_filter_6bpp_loop: |
| 866 + add r1,r1,r12 @; loop counter += bpp |
| 867 + cmp r1,r0 |
| 868 + @; the flags from this cmp are tested by the bne at the loop end |
| 869 + |
| 870 + vld1.8 {d2},[r2] @; load 8 bytes (pixel x + 2 extra |
| 871 + @; bytes) from curr row into d2 |
| 872 + vld1.8 {d1},[r3],r12 @; load 8 bytes (pixel b + 2 extra |
| 873 + @; bytes) from prev row into d1 |
| 874 + vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b) |
| 875 + vshrn.i16 d1,q2,#1 @; d1 = (a + b)/2 |
| 876 + vadd.i8 d0,d2,d1 @; d0 = x + ((a + b)/2) |
| 877 + vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x) |
| 878 + @; increment curr row pointer |
| 879 + vst1.16 {d0[2]},[r2]! @; store 2 bytes (updated pixel x) |
| 880 + @; increment curr row pointer |
| 881 + bne avg_filter_6bpp_loop |
| 882 + |
| 883 + b DONE @; exit loop when |
| 884 + @; loop counter == rowbytes |
| 885 + #;; ----------------------------- |
| 886 + #;; AVG filter, 8 bytes per pixel |
| 887 + #;; ----------------------------- |
| 888 +avg_filter_8bpp: |
| 889 + @; pixel handled before the loop: only (b >> 1) is added to x |
| 890 + cmp r1,r0 @; loop counter == rowbytes ? |
| 891 + @; the flags from this cmp are tested by the beq DONE below |
| 892 + vld1.8 {d0},[r2] @; load 8 bytes (pixel x) from curr |
| 893 + @; row into d0 |
| 894 + vld1.8 {d1},[r3]! @; load 8 bytes (pixel b) from prev |
| 895 + @; row into d1 |
| 896 + @; increment prev row pointer |
| 897 + vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add |
| 898 + @; to pixel x |
| 899 + vst1.8 {d0},[r2]! @; store 8 bytes (updated pixel x) |
| 900 + @; increment curr row pointer |
| 901 + @; updated pixel x is now pixel a |
| 902 + beq DONE @; nothing left after this pixel |
| 903 +avg_filter_8bpp_loop: |
| 904 + add r1,r1,r12 @; loop counter += bpp |
| 905 + cmp r1,r0 |
| 906 + @; the flags from this cmp are tested by the bne at the loop end |
| 907 + @; each pass computes x += (a + b)/2, with a carried over in d0 |
| 908 + vld1.8 {d2},[r2] @; load 8 bytes (pixel x) from curr |
| 909 + @; row into d2 |
| 910 + vld1.8 {d1},[r3]! @; load 8 bytes (pixel b) from prev |
| 911 + @; row into d1, increment prev row pointer |
| 912 + vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b), widened to 16 bits |
| 913 + vshrn.i16 d1,q2,#1 @; d1 = (a + b)/2 |
| 914 + vadd.i8 d0,d2,d1 @; d0 = x + ((a + b)/2) |
| 915 + vst1.8 {d0},[r2]! @; store 8 bytes (updated pixel x) |
| 916 + @; increment curr row pointer |
| 917 + bne avg_filter_8bpp_loop |
| 918 + |
| 919 + b DONE @; exit loop when |
| 920 + @; loop counter == rowbytes |
| 921 + #;; ----------------- |
| 922 + #;; PAETH filter type |
| 923 + #;; ----------------- |
| 924 +paeth_filter: |
| 925 + @; q4-q7 are used as scratch by the PAETH loops; restored at paeth_filter_DONE |
| 926 + VPUSH {q4-q7} |
| 927 + add r1,r1,#7 @; bpp = bytes per pixel |
| 928 + lsr r1,r1,#3 @; = (pixel_depth + 7) >> 3 |
| 929 + mov r12,r1 @; keep bpp in r12; r1 becomes the loop counter |
| 930 + |
| 931 + #;; r0 = rowbytes |
| 932 + #;; r1 = loop counter = bpp (initially) |
| 933 + #;; r2 = row pointer |
| 934 + #;; r3 = previous row pointer |
| 935 + #;; r12 = bpp = loop/pointer increment value |
| 936 + |
| 937 + @; dispatch on bpp; only 1, 2, 3, 4, 6 and 8 have a handler |
| 938 + cmp r12,#1 |
| 939 + beq paeth_filter_1bpp |
| 940 + |
| 941 + cmp r12,#2 |
| 942 + beq paeth_filter_2bpp |
| 943 + |
| 944 + cmp r12,#3 |
| 945 + beq paeth_filter_3bpp |
| 946 + |
| 947 + cmp r12,#4 |
| 948 + beq paeth_filter_4bpp |
| 949 + |
| 950 + cmp r12,#6 |
| 951 + beq paeth_filter_6bpp |
| 952 + |
| 953 + cmp r12,#8 |
| 954 + beq paeth_filter_8bpp |
| 955 + @; any other bpp falls through and returns with the row left untouched |
| 956 +paeth_filter_exit: |
| 957 + b paeth_filter_DONE @; return |
| 958 + |
| 959 + #;; ------------------------------ |
| 960 + #;; PAETH filter, 1 byte per pixel |
| 961 + #;; ------------------------------ |
| 962 +paeth_filter_1bpp: |
| 963 + @; first pixel: no a or c is used here, so x += b |
| 964 + cmp r1, r0 @; loop counter == rowbytes ? |
| 965 + @; the flags from this cmp are tested by the beq below |
| 966 + vld1.8 {d0[0]},[r2] @; load 1 byte (pixel x) from curr |
| 967 + @; row into d0[0] |
| 968 + vld1.8 {d1[0]},[r3]! @; load 1 byte (pixel b) from prev |
| 969 + @; row into d1[0] |
| 970 + @; increment prev row pointer |
| 971 + vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x |
| 972 + vst1.8 {d2[0]},[r2]! @; store 1 byte (updated pixel x) |
| 973 + @; increment curr row pointer |
| 974 + |
| 975 + beq paeth_filter_DONE |
| 976 + |
| 977 +paeth_filter_1bpp_loop: |
| 978 + add r1,r1,r12 @; loop counter += bpp |
| 979 + cmp r1,r0 |
| 980 + @; the flags from this cmp are tested by the bne at the loop end |
| 981 + |
| 982 + #;; d1[0] = c (b in the previous loop iteration) |
| 983 + #;; d2[0] = a (updated x from the previous loop iteration) |
| 984 + vld1.8 {d3[0]},[r3]! @; load 1 byte (pixel b) from prev |
| 985 + @; row into d3[0] |
| 986 + vld1.8 {d0[0]},[r2] @; load 1 byte (pixel x) from curr |
| 987 + @; row into d0[0] |
| 988 + vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c |
| 989 + vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c) |
| 990 + vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c) |
| 991 + vaddl.u8 q5,d2,d3 @; q5 = a + b |
| 992 + vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c) |
| 993 + @; build selection masks from pa, pb, pc (16-bit lanes, narrowed to bytes) |
| 994 + vcle.s16 q5,q2,q3 @; q5 = (pa <= pb) |
| 995 + vcle.s16 q6,q2,q4 @; q6 = (pa <= pc) |
| 996 + vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc)) |
| 997 + vcle.s16 q7,q3,q4 @; q7 = (pb <= pc) |
| 998 + vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc)) |
| 999 + vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc) |
| 1000 + @; p = a where d10 is set, else b where d14 is set, else c |
| 1001 + vand d2,d2,d10 @; d2 = a where 1, 0 where 0 |
| 1002 + vbsl d14,d3,d1 @; d14 = b where 1, c where 0 |
| 1003 + vmvn d10,d10 @; invert d10 |
| 1004 + vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0 |
| 1005 + vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate |
| 1006 + vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x) |
| 1007 + vmov d1,d3 @; d1 = b (c for next iteration) |
| 1008 + vst1.8 {d2[0]},[r2]! @; store 1 byte (updated pixel x) |
| 1009 + @; increment curr row pointer |
| 1010 + @; d2 stays live as a for the next iteration |
| 1011 + bne paeth_filter_1bpp_loop |
| 1012 + |
| 1013 + b paeth_filter_DONE @; exit loop when |
| 1014 + @; loop counter == rowbytes |
| 1015 + #;; ------------------------------- |
| 1016 + #;; PAETH filter, 2 bytes per pixel |
| 1017 + #;; ------------------------------- |
| 1018 +paeth_filter_2bpp: |
| 1019 + @; first pixel: no a or c is used here, so x += b |
| 1020 + cmp r1, r0 @; loop counter == rowbytes ? |
| 1021 + @; the flags from this cmp are tested by the beq below |
| 1022 + vld1.16 {d0[0]},[r2] @; load 2 bytes (pixel x) from curr |
| 1023 + @; row into d0[0] |
| 1024 + vld1.16 {d1[0]},[r3]! @; load 2 bytes (pixel b) from prev |
| 1025 + @; row into d1[0] |
| 1026 + @; increment prev row pointer |
| 1027 + vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x |
| 1028 + vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x) |
| 1029 + @; increment curr row pointer |
| 1030 + beq paeth_filter_DONE |
| 1031 + |
| 1032 +paeth_filter_2bpp_loop: |
| 1033 + add r1,r1,r12 @; loop counter += bpp |
| 1034 + cmp r1,r0 |
| 1035 + @; the flags from this cmp are tested by the bne at the loop end |
| 1036 + #;; d1[0] = c (b in the previous loop iteration) |
| 1037 + #;; d2[0] = a (updated x from the previous loop iteration) |
| 1038 + vld1.16 {d3[0]},[r3]! @; load 2 bytes (pixel b) from prev |
| 1039 + @; row into d3[0] |
| 1040 + vld1.16 {d0[0]},[r2] @; load 2 bytes (pixel x) from curr |
| 1041 + @; row into d0[0] |
| 1042 + vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c |
| 1043 + vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c) |
| 1044 + vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c) |
| 1045 + vaddl.u8 q5,d2,d3 @; q5 = a + b |
| 1046 + vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c) |
| 1047 + @; build selection masks from pa, pb, pc (16-bit lanes, narrowed to bytes) |
| 1048 + vcle.s16 q5,q2,q3 @; q5 = (pa <= pb) |
| 1049 + vcle.s16 q6,q2,q4 @; q6 = (pa <= pc) |
| 1050 + vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc)) |
| 1051 + vcle.s16 q7,q3,q4 @; q7 = (pb <= pc) |
| 1052 + vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc)) |
| 1053 + vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc) |
| 1054 + @; p = a where d10 is set, else b where d14 is set, else c |
| 1055 + vand d2,d2,d10 @; d2 = a where 1, 0 where 0 |
| 1056 + vbsl d14,d3,d1 @; d14 = b where 1, c where 0 |
| 1057 + vmvn d10,d10 @; invert d10 |
| 1058 + vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0 |
| 1059 + vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate |
| 1060 + vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x) |
| 1061 + vmov d1,d3 @; d1 = b (c for next iteration) |
| 1062 + vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x) |
| 1063 + @; increment curr row pointer |
| 1064 + bne paeth_filter_2bpp_loop |
| 1065 + |
| 1066 + b paeth_filter_DONE @; exit loop when |
| 1067 + @; loop counter == rowbytes |
| 1068 + #;; ------------------------------- |
| 1069 + #;; PAETH filter, 3 bytes per pixel |
| 1070 + #;; ------------------------------- |
| 1071 +paeth_filter_3bpp: |
| 1072 + @; first pixel: no a or c is used here, so x += b |
| 1073 + cmp r1, r0 @; loop counter == rowbytes ? |
| 1074 + @; NOTE(review): each 4-byte load reads 1 byte past the pixel - confirm row padding |
| 1075 + vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x + 1 extra |
| 1076 + @; byte) from curr row into d0[0] |
| 1077 + vld1.32 {d1[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra |
| 1078 + @; byte) from prev row into d1[0] |
| 1079 + @; advance prev row pointer by r12 |
| 1080 + vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x |
| 1081 + vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x) |
| 1082 + @; increment curr row pointer |
| 1083 + vst1.8 {d2[2]},[r2]! @; store 1 byte (updated pixel x) |
| 1084 + @; increment curr row pointer |
| 1085 + beq paeth_filter_DONE @; flags still come from the cmp above |
| 1086 + |
| 1087 +paeth_filter_3bpp_loop: |
| 1088 + add r1,r1,r12 @; loop counter += bpp |
| 1089 + cmp r1,r0 |
| 1090 + @; the flags from this cmp are tested by the bne at the loop end |
| 1091 + |
| 1092 + #;; d1[0] = c (b in the previous loop iteration) |
| 1093 + #;; d2[0] = a (updated x from the previous loop iteration) |
| 1094 + vld1.32 {d3[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra |
| 1095 + @; byte) from prev row into d3[0] |
| 1096 + vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x + 1 extra |
| 1097 + @; byte) from curr row into d0[0] |
| 1098 + vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c |
| 1099 + vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c) |
| 1100 + vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c) |
| 1101 + vaddl.u8 q5,d2,d3 @; q5 = a + b |
| 1102 + vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c) |
| 1103 + @; build selection masks from pa, pb, pc (16-bit lanes, narrowed to bytes) |
| 1104 + vcle.s16 q5,q2,q3 @; q5 = (pa <= pb) |
| 1105 + vcle.s16 q6,q2,q4 @; q6 = (pa <= pc) |
| 1106 + vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc)) |
| 1107 + vcle.s16 q7,q3,q4 @; q7 = (pb <= pc) |
| 1108 + vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc)) |
| 1109 + vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc) |
| 1110 + @; p = a where d10 is set, else b where d14 is set, else c |
| 1111 + vand d2,d2,d10 @; d2 = a where 1, 0 where 0 |
| 1112 + vbsl d14,d3,d1 @; d14 = b where 1, c where 0 |
| 1113 + vmvn d10,d10 @; invert d10 |
| 1114 + vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0 |
| 1115 + vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate |
| 1116 + vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x) |
| 1117 + vmov d1,d3 @; d1 = b (c for next iteration) |
| 1118 + vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x) |
| 1119 + @; increment curr row pointer |
| 1120 + vst1.8 {d2[2]},[r2]! @; store 1 byte (updated pixel x) |
| 1121 + @; increment curr row pointer |
| 1122 + bne paeth_filter_3bpp_loop |
| 1123 + |
| 1124 + b paeth_filter_DONE @; exit loop when |
| 1125 + @; loop counter == rowbytes |
| 1126 + #;; ------------------------------- |
| 1127 + #;; PAETH filter, 4 bytes per pixel |
| 1128 + #;; ------------------------------- |
| 1129 +paeth_filter_4bpp: |
| 1130 + @; first pixel: no a or c is used here, so x += b |
| 1131 + cmp r1, r0 @; loop counter == rowbytes ? |
| 1132 + @; the flags from this cmp are tested by the beq below |
| 1133 + vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x) from curr |
| 1134 + @; row into d0[0] |
| 1135 + vld1.32 {d1[0]},[r3]! @; load 4 bytes (pixel b) from prev |
| 1136 + @; row into d1[0] |
| 1137 + @; increment prev row pointer |
| 1138 + vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x |
| 1139 + vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x) |
| 1140 + @; increment curr row pointer |
| 1141 + beq paeth_filter_DONE |
| 1142 + |
| 1143 +paeth_filter_4bpp_loop: |
| 1144 + add r1,r1,r12 @; loop counter += bpp |
| 1145 + cmp r1,r0 |
| 1146 + @; the flags from this cmp are tested by the bne at the loop end |
| 1147 + |
| 1148 + #;; d1[0] = c (b in the previous loop iteration) |
| 1149 + #;; d2[0] = a (updated x from the previous loop iteration) |
| 1150 + vld1.32 {d3[0]},[r3]! @; load 4 bytes (pixel b) from prev |
| 1151 + @; row into d3[0] |
| 1152 + vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x) from curr |
| 1153 + @; row into d0[0] |
| 1154 + vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c |
| 1155 + vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c) |
| 1156 + vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c) |
| 1157 + vaddl.u8 q5,d2,d3 @; q5 = a + b |
| 1158 + vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c) |
| 1159 + @; build selection masks from pa, pb, pc (16-bit lanes, narrowed to bytes) |
| 1160 + vcle.s16 q5,q2,q3 @; q5 = (pa <= pb) |
| 1161 + vcle.s16 q6,q2,q4 @; q6 = (pa <= pc) |
| 1162 + vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc)) |
| 1163 + vcle.s16 q7,q3,q4 @; q7 = (pb <= pc) |
| 1164 + vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc)) |
| 1165 + vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc) |
| 1166 + @; p = a where d10 is set, else b where d14 is set, else c |
| 1167 + vand d2,d2,d10 @; d2 = a where 1, 0 where 0 |
| 1168 + vbsl d14,d3,d1 @; d14 = b where 1, c where 0 |
| 1169 + vmvn d10,d10 @; invert d10 |
| 1170 + vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0 |
| 1171 + vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate |
| 1172 + vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x) |
| 1173 + vmov d1,d3 @; d1 = b (c for next iteration) |
| 1174 + vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x) |
| 1175 + @; increment curr row pointer |
| 1176 + bne paeth_filter_4bpp_loop |
| 1177 + |
| 1178 + b paeth_filter_DONE @; exit loop when |
| 1179 + @; loop counter == rowbytes |
| 1180 + #;; ------------------------------- |
| 1181 + #;; PAETH filter, 6 bytes per pixel |
| 1182 + #;; ------------------------------- |
| 1183 +paeth_filter_6bpp: |
| 1184 + cmp r1, r0 @; loop counter == rowbytes ? |
| 1185 + @; first pixel: no a or c is used here, so x += b |
| 1186 + vld1.8 {d0},[r2] @; load 8 bytes (pixel x + 2 extra |
| 1187 + @; bytes) from curr row into d0 |
| 1188 + vld1.8 {d1},[r3],r12 @; load 8 bytes (pixel b + 2 extra |
| 1189 + @; bytes) from prev row into d1 |
| 1190 + @; advance prev row pointer by r12 |
| 1191 + vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x |
| 1192 + vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x) |
| 1193 + @; increment curr row pointer |
| 1194 + vst1.16 {d2[2]},[r2]! @; store 2 bytes (updated pixel x) |
| 1195 + @; increment curr row pointer |
| 1196 + beq paeth_filter_DONE @; flags still come from the cmp above |
| 1197 + |
| 1198 +paeth_filter_6bpp_loop: |
| 1199 + add r1,r1,r12 @; loop counter += bpp |
| 1200 + cmp r1,r0 |
| 1201 + @; the flags from this cmp are tested by the bne at the loop end |
| 1202 + @; NOTE(review): each 8-byte load reads 2 bytes past the pixel - confirm row padding |
| 1203 + #;; d1 = c (b in the previous loop iteration) |
| 1204 + #;; d2 = a (updated x from the previous loop iteration) |
| 1205 + vld1.8 {d3},[r3],r12 @; load 8 bytes (pixel b + 2 extra |
| 1206 + @; bytes) from prev row into d3 |
| 1207 + vld1.8 {d0},[r2] @; load 8 bytes (pixel x + 2 extra |
| 1208 + @; bytes) from curr row into d0 |
| 1209 + vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c |
| 1210 + vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c) |
| 1211 + vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c) |
| 1212 + vaddl.u8 q5,d2,d3 @; q5 = a + b |
| 1213 + vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c) |
| 1214 + @; build selection masks from pa, pb, pc (16-bit lanes, narrowed to bytes) |
| 1215 + vcle.s16 q5,q2,q3 @; q5 = (pa <= pb) |
| 1216 + vcle.s16 q6,q2,q4 @; q6 = (pa <= pc) |
| 1217 + vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc)) |
| 1218 + vcle.s16 q7,q3,q4 @; q7 = (pb <= pc) |
| 1219 + vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc)) |
| 1220 + vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc) |
| 1221 + @; p = a where d10 is set, else b where d14 is set, else c |
| 1222 + vand d2,d2,d10 @; d2 = a where 1, 0 where 0 |
| 1223 + vbsl d14,d3,d1 @; d14 = b where 1, c where 0 |
| 1224 + vmvn d10,d10 @; invert d10 |
| 1225 + vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0 |
| 1226 + vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate |
| 1227 + vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x) |
| 1228 + vmov d1,d3 @; d1 = b (c for next iteration) |
| 1229 + vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x) |
| 1230 + @; increment curr row pointer |
| 1231 + vst1.16 {d2[2]},[r2]! @; store 2 bytes (updated pixel x) |
| 1232 + @; increment curr row pointer |
| 1233 + bne paeth_filter_6bpp_loop |
| 1234 + |
| 1235 + b paeth_filter_DONE @; exit loop when |
| 1236 + @; loop counter == rowbytes |
| 1237 + #;; ------------------------------- |
| 1238 + #;; PAETH filter, 8 bytes per pixel |
| 1239 + #;; ------------------------------- |
| 1240 +paeth_filter_8bpp: |
| 1241 + cmp r1, r0 @; loop counter == rowbytes ? |
| 1242 + @; first pixel: no a or c is used here, so x += b |
| 1243 + vld1.8 {d0},[r2] @; load 8 bytes (pixel x) from curr |
| 1244 + @; row into d0 |
| 1245 + vld1.8 {d1},[r3]! @; load 8 bytes (pixel b) from prev |
| 1246 + @; row into d1 |
| 1247 + @; increment prev row pointer |
| 1248 + vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x |
| 1249 + vst1.8 {d2},[r2]! @; store 8 bytes (updated pixel x) |
| 1250 + @; increment curr row pointer |
| 1251 + beq paeth_filter_DONE @; flags still come from the cmp above |
| 1252 + |
| 1253 +paeth_filter_8bpp_loop: |
| 1254 + add r1,r1,r12 @; loop counter += bpp |
| 1255 + cmp r1,r0 |
| 1256 + @; the flags from this cmp are tested by the bne at the loop end |
| 1257 + |
| 1258 + #;; d1 = c (b in the previous loop iteration) |
| 1259 + #;; d2 = a (updated x from the previous loop iteration) |
| 1260 + vld1.8 {d3},[r3]! @; load 8 bytes (pixel b) from prev |
| 1261 + @; row into d3 |
| 1262 + vld1.8 {d0},[r2] @; load 8 bytes (pixel x) from curr |
| 1263 + @; row into d0 |
| 1264 + vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c |
| 1265 + vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c) |
| 1266 + vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c) |
| 1267 + vaddl.u8 q5,d2,d3 @; q5 = a + b |
| 1268 + vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c) |
| 1269 + @; build selection masks from pa, pb, pc (16-bit lanes, narrowed to bytes) |
| 1270 + vcle.s16 q5,q2,q3 @; q5 = (pa <= pb) |
| 1271 + vcle.s16 q6,q2,q4 @; q6 = (pa <= pc) |
| 1272 + vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc)) |
| 1273 + vcle.s16 q7,q3,q4 @; q7 = (pb <= pc) |
| 1274 + vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc)) |
| 1275 + vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc) |
| 1276 + @; p = a where d10 is set, else b where d14 is set, else c |
| 1277 + vand d2,d2,d10 @; d2 = a where 1, 0 where 0 |
| 1278 + vbsl d14,d3,d1 @; d14 = b where 1, c where 0 |
| 1279 + vmvn d10,d10 @; invert d10 |
| 1280 + vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0 |
| 1281 + vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate |
| 1282 + vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x) |
| 1283 + vmov d1,d3 @; d1 = b (c for next iteration) |
| 1284 + vst1.8 {d2},[r2]! @; store 8 bytes (updated pixel x) |
| 1285 + @; increment curr row pointer |
| 1286 + bne paeth_filter_8bpp_loop |
| 1287 + |
| 1288 + b paeth_filter_DONE @; exit loop when |
| 1289 + @; loop counter == rowbytes |
| 1290 +paeth_filter_DONE: |
| 1291 + @; common exit for the PAETH paths: undo the VPUSH done at paeth_filter |
| 1292 + VPOP {q4-q7} |
| 1293 + bx r14 @; return to the caller |
| 1294 + @; exit for the paths that branch here without restoring q4-q7 (e.g. AVG) |
| 1295 +DONE: |
| 1296 + bx r14 @; return to the caller |
| 1297 + |
| 1298 + |
| 1299 +.size png_read_filter_row_neon, .-png_read_filter_row_neon |
| 1300 + .END |
| 1301 +#endif |
| 1302 diff --git a/pngrutil.c b/pngrutil.c |
| 1303 index 1e2db31..adfffb2 100755 |
| 1304 --- a/pngrutil.c |
| 1305 +++ b/pngrutil.c |
| 1306 @@ -23,6 +23,10 @@ |
| 1307 # define WIN32_WCE_OLD |
| 1308 #endif |
| 1309 |
| 1310 +#if defined(__ARM_NEON__) |
| 1311 +extern void png_read_filter_row_neon(png_uint_32 rowbytes, png_byte pixel_depth, png_bytep row, png_bytep prev_row, int filter); |
| 1312 +#endif |
| 1313 + |
| 1314 #ifdef PNG_FLOATING_POINT_SUPPORTED |
| 1315 # ifdef WIN32_WCE_OLD |
| 1316 /* The strtod() function is not supported on WindowsCE */ |
| 1317 @@ -2928,6 +2932,9 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep row, |
| 1318 { |
| 1319 png_debug(1, "in png_read_filter_row"); |
| 1320 png_debug2(2, "row = %lu, filter = %d", png_ptr->row_number, filter); |
| 1321 +#if defined(__ARM_NEON__) |
| 1322 + png_read_filter_row_neon(row_info->rowbytes, row_info->pixel_depth, row, prev_row, filter); |
| 1323 +#else |
| 1324 switch (filter) |
| 1325 { |
| 1326 case PNG_FILTER_VALUE_NONE: |
| 1327 @@ -3043,6 +3050,7 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep row, |
| 1328 *row = 0; |
| 1329 break; |
| 1330 } |
| 1331 +#endif /* #if defined(__ARM_NEON__) */ |
| 1332 } |
| 1333 |
| 1334 #ifdef PNG_SEQUENTIAL_READ_SUPPORTED |
OLD | NEW |