Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(56)

Side by Side Diff: media-libs/libpng/files/libpng-1.2.44-neon-optimized.patch

Issue 5141003: Applying Neon optimization patch to the LIBPNG library. Base URL: http://git.chromium.org/git/chromiumos-overlay.git@master
Patch Set: Created 10 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | media-libs/libpng/libpng-1.2.44.ebuild » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 diff --git a/Makefile.in b/Makefile.in
2 index b9c41f0..8472c8a 100644
3 --- a/Makefile.in
4 +++ b/Makefile.in
5 @@ -99,7 +99,8 @@ am__objects_1 = libpng_la-png.lo libpng_la-pngset.lo \
6 libpng_la-pngread.lo libpng_la-pngrio.lo libpng_la-pngwio.lo \
7 libpng_la-pngwrite.lo libpng_la-pngrtran.lo \
8 libpng_la-pngwtran.lo libpng_la-pngmem.lo \
9 - libpng_la-pngerror.lo libpng_la-pngpread.lo
10 + libpng_la-pngerror.lo libpng_la-pngpread.lo \
11 + libpng_la-png_read_filter_row_neon.lo
12 am_libpng_la_OBJECTS = $(am__objects_1)
13 libpng_la_OBJECTS = $(am_libpng_la_OBJECTS)
14 libpng_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
15 @@ -113,7 +114,7 @@ am_libpng12_la_OBJECTS = libpng12_la-png.lo libpng12_la-pngs et.lo \
16 libpng12_la-pngwio.lo libpng12_la-pngwrite.lo \
17 libpng12_la-pngrtran.lo libpng12_la-pngwtran.lo \
18 libpng12_la-pngmem.lo libpng12_la-pngerror.lo \
19 - libpng12_la-pngpread.lo
20 + libpng12_la-pngpread.lo libpng12_la-png_read_filter_row_neon.lo
21 libpng12_la_OBJECTS = $(am_libpng12_la_OBJECTS)
22 libpng12_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
23 $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
24 @@ -136,9 +137,9 @@ LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS ) \
25 --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
26 $(LDFLAGS) -o $@
27 SOURCES = $(libpng_la_SOURCES) $(libpng12_la_SOURCES) \
28 - $(pngtest_SOURCES)
29 + $(pngtest_SOURCES) $(pngasm_SOURCES)
30 DIST_SOURCES = $(libpng_la_SOURCES) $(libpng12_la_SOURCES) \
31 - $(pngtest_SOURCES)
32 + $(pngtest_SOURCES) $(pngasm_SOURCES)
33 man3dir = $(mandir)/man3
34 man5dir = $(mandir)/man5
35 NROFF = nroff
36 @@ -307,6 +308,8 @@ dist_man_MANS = libpng.3 libpngpf.3 png.5
37 EXTRA_SCRIPTS = libpng-config libpng12-config
38 bin_SCRIPTS = @binconfigs@
39
40 +pngasm_SOURCES = png_read_filter_row_neon.S
41 +
42 # rules to build libpng, only build the old library on request
43 lib_LTLIBRARIES = libpng12.la @compatlib@
44 EXTRA_LTLIBRARIES = libpng.la
45 @@ -363,7 +366,7 @@ all: config.h
46 $(MAKE) $(AM_MAKEFLAGS) all-am
47
48 .SUFFIXES:
49 -.SUFFIXES: .c .lo .o .obj
50 +.SUFFIXES: .c .S .lo .o .obj
51 am--refresh:
52 @:
53 $(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__conf igure_deps)
54 @@ -537,6 +540,7 @@ distclean-compile:
55 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pngwrite.Plo@am__quo te@
56 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pngwtran.Plo@am__quo te@
57 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-pngwutil.Plo@am__quo te@
58 +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpng_la-png_read_filter_row_ neon.Plo@am__quote@
59 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pngtest.Po@am__quote@
60
61 .c.o:
62 @@ -553,6 +557,13 @@ distclean-compile:
63 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
64 @am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'`
65
66 +.S.o:
67 +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
68 +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
69 +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDE PBACKSLASH@
70 +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
71 +@am__fastdepCC_FALSE@ $(COMPILE) -c $<
72 +
73 .c.lo:
74 @am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $ @ $<
75 @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
76 @@ -560,6 +571,14 @@ distclean-compile:
77 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
78 @am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $<
79
80 +.S.lo:
81 +@am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $ @ $<
82 +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
83 +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=yes @AMD EPBACKSLASH@
84 +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
85 +@am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $<
86 +
87 +
88 libpng_la-png.lo: png.c
89 @am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng_la_CPPFLAG S) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpng_la-png.lo -MD -MP -MF $(DEPDIR) /libpng_la-png.Tpo -c -o libpng_la-png.lo `test -f 'png.c' || echo '$(srcdir)/'` png.c
90 @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/libpng_la-png.Tpo $(DEPDIR)/libpng_l a-png.Plo
91 @@ -665,6 +684,16 @@ libpng_la-pngpread.lo: pngpread.c
92 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
93 @am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng_la_CPPFLAG S) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng_la-pngpread.lo `test -f 'pngp read.c' || echo '$(srcdir)/'`pngpread.c
94
95 +
96 +
97 +libpng_la-png_read_filter_row_neon.lo: png_read_filter_row_neon.S
98 +@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng_la_CPPFLAG S) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpng_la-png_read_filter_row_neon.lo -MD -MP -MF $(DEPDIR)/libpng_la-png_read_filter_row_neon.Tpo -c -o libpng_la-png _read_filter_row_neon.lo `test -f 'png_read_filter_row_neon.S' || echo '$(srcdir )/'`png_read_filter_row_neon.S
99 +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/libpng_la-png_read_filter_row_neon.T po $(DEPDIR)/libpng_la-png_read_filter_row_neon.Plo
100 +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='png_read_filter_row_neon.S' obje ct='libpng_la-png_read_filter_row_neon.lo' libtool=yes @AMDEPBACKSLASH@
101 +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
102 +@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng_la_CPPFLAG S) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng_la-png_read_filter_row_neon.l o `test -f 'png_read_filter_row_neon.S' || echo '$(srcdir)/'`png_read_filter_row _neon.S
103 +
104 +
105 libpng12_la-png.lo: png.c
106 @am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFL AGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpng12_la-png.lo -MD -MP -MF $(DEP DIR)/libpng12_la-png.Tpo -c -o libpng12_la-png.lo `test -f 'png.c' || echo '$(sr cdir)/'`png.c
107 @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/libpng12_la-png.Tpo $(DEPDIR)/libpng 12_la-png.Plo
108 @@ -770,6 +799,15 @@ libpng12_la-pngpread.lo: pngpread.c
109 @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
110 @am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFL AGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng12_la-pngpread.lo `test -f ' pngpread.c' || echo '$(srcdir)/'`pngpread.c
111
112 +# libpng12: assemble png_read_filter_row_neon.S via libtool; the depcomp fallback must name the real source file (with its .S suffix).
113 +libpng12_la-png_read_filter_row_neon.lo: png_read_filter_row_neon.S
114 +@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libpng12_la-png_read_filter_row_neon.lo -MD -MP -MF $(DEPDIR)/libpng12_la-png_read_filter_row_neon.Tpo -c -o libpng12_la-png_read_filter_row_neon.lo `test -f 'png_read_filter_row_neon.S' || echo '$(srcdir)/'`png_read_filter_row_neon.S
115 +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/libpng12_la-png_read_filter_row_neon.Tpo $(DEPDIR)/libpng12_la-png_read_filter_row_neon.Plo
116 +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='png_read_filter_row_neon.S' object='libpng12_la-png_read_filter_row_neon.lo' libtool=yes @AMDEPBACKSLASH@
117 +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
118 +@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libpng12_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libpng12_la-png_read_filter_row_neon.lo `test -f 'png_read_filter_row_neon.S' || echo '$(srcdir)/'`png_read_filter_row_neon.S
119 +
120 +
121 mostlyclean-libtool:
122 -rm -f *.lo
123
124 diff --git a/png_read_filter_row_neon.S b/png_read_filter_row_neon.S
125 new file mode 100755
126 index 0000000..77ec7bd
127 --- /dev/null
128 +++ b/png_read_filter_row_neon.S
129 @@ -0,0 +1,1172 @@
130 +#if defined(__ARM_NEON__)
131 +#; Copyright (c) 2010, Code Aurora Forum. All rights reserved.
132 +#;
133 +#; Redistribution and use in source and binary forms, with or without
134 +#; modification, are permitted provided that the following conditions are
135 +#; met:
136 +#; * Redistributions of source code must retain the above copyright
137 +#; notice, this list of conditions and the following disclaimer.
138 +#; * Redistributions in binary form must reproduce the above
139 +#; copyright notice, this list of conditions and the following
140 +#; disclaimer in the documentation and/or other materials provided
141 +#; with the distribution.
142 +#; * Neither the name of Code Aurora Forum, Inc. nor the names of its
143 +#; contributors may be used to endorse or promote products derived
144 +#; from this software without specific prior written permission.
145 +#;
146 +#; THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
147 +#; WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
148 +#; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
149 +#; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
150 +#; BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
151 +#; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
152 +#; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
153 +#; BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
154 +#; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
155 +#; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
156 +#; IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
157 +
158 +#;============================================================================= =
159 +
160 + .code 32 @; Code is ARM ISA
161 +#;============================================================================= =
162 +
163 + .global png_read_filter_row_neon
164 +
165 +#;============================================================================= =
166 +#; INPUTS: r0 rowbytes: number of bytes in current row
167 +#; r1 pixel_depth: number of bits per pixel
168 +#; r2 row: pointer to start of current row
169 +#; r3 prev_row: pointer to start of previous row
170 +#; [sp,#0] filter: filter type
171 +#;
172 +#; NOTE: Don't touch r5-r11
173 +#;============================================================================= =
174 +.balign 32
175 +.type png_read_filter_row_neon, %function
176 +png_read_filter_row_neon:
177 +
178 + ldr r12,[sp,#0]
179 +
180 + cmp r12,#0
181 + beq DONE
182 +
183 + cmp r12,#1
184 + beq sub_filter
185 +
186 + cmp r12,#2
187 + beq up_filter
188 +
189 + cmp r12,#3
190 + beq avg_filter
191 +
192 + cmp r12,#4
193 + beq paeth_filter
194 +
195 + b DONE
196 +
197 + #;; ---------------
198 + #;; SUB filter type
199 + #;; ---------------
200 +
201 +
202 +sub_filter:
203 +
204 + stmdb sp!, {r4}
205 +
206 + add r1,r1,#7 @; bpp = bytes per pixel
207 + lsr r1,r1,#3 @; = (pixel_depth + 7) >> 3
208 + mov r12,r1
209 +
210 + #;; r0 = rowbytes
211 + #;; r1 = loop counter = bpp (initially)
212 + #;; r2 = row pointer
213 + #;; r12 = bpp = loop/pointer increment value
214 +
215 + cmp r1,r0
216 + beq sub_filter_exit @; exit if bpp == rowbytes
217 +
218 + cmp r12,#1
219 + beq sub_filter_1bpp
220 +
221 + cmp r12,#2
222 + beq sub_filter_2bpp
223 +
224 + cmp r12,#3
225 + beq sub_filter_3bpp
226 +
227 + cmp r12,#4
228 + beq sub_filter_4bpp
229 +
230 + cmp r12,#6
231 + beq sub_filter_6bpp
232 +
233 + cmp r12,#8
234 + beq sub_filter_8bpp
235 +
236 +sub_filter_exit:
237 + b sub_filter_DONE @; return
238 +
239 +
240 +sub_filter_1bpp:
241 +
242 + #;; ----------------------------
243 + #;; SUB filter, 1 byte per pixel
244 + #;; ----------------------------
245 +
246 + lsrs r4,r0,#4 @; r4 = floor(rowbytes/16)
247 + @; = iteration count for loop 16
248 + beq sub_filter_1bpp_16bytes_done
249 +
250 + vmov.i8 d21, #0 @; d21 = 0, zero fill for the vext extractions in the loop
251 + vld1.8 {d16,d17}, [r2] @; load 16 pixels
252 + @; d16 = a b c d e f g h
253 + @; d17 = i j k l m n o p
254 +
255 + mov r1, #0 @; r1 = number of bytes completed so far
256 +sub_filter_1bpp_16bytes:
257 + @; one pass = 16 bytes: byte-wise running sum of d16 then d17,
258 + @; built by repeated "shift left one byte, add the input" steps
259 +
260 +
261 + vshl.i64 d18, d16, #8 @; d18 = 0 a b c d e f g
262 + vadd.i8 d18, d16, d18 @; d18 = a a+b b+c c+d d+e e+f f+g g+h
263 +
264 + vshl.i64 d18, d18, #8 @; d18 = 0 a a+b b+c c+d d+e e+f f+g
265 + vadd.i8 d18, d16, d18 @; d18 = a a+b a+b+c b+c+d c+d+e d+e+f e+f+g f+g+h
266 +
267 + vshl.i64 d18, d18, #8 @; shift-add repeatedly to propagate the sum of previous
268 + vadd.i8 d18, d16, d18 @; and current pixels
269 +
270 + vshl.i64 d18, d18, #8
271 + vadd.i8 d18, d16, d18
272 +
273 + vshl.i64 d18, d18, #8
274 + vadd.i8 d18, d16, d18
275 +
276 + vshl.i64 d18, d18, #8
277 + vadd.i8 d18, d16, d18
278 +
279 + vshl.i64 d18, d18, #8
280 + vadd.i8 d18, d16, d18 @; maximum data size for shift is 64 bits i.e. doubleword.
281 + @; after computing the value of all the pixels in the double word
282 + @; extract the last computed value which will be used by
283 + @; the next set of pixels (i.e next doubleword)
284 + vext.8 d22, d18, d21, #7 @; extract the updated value of d18[7] i.e a+b+c+d+e+f+g+h
285 + vadd.i8 d17, d17, d22 @; d17 = a+b+c+d+e+f+g+h+i j k l m n o p
286 +
287 + vshl.i64 d19, d17, #8 @; continue shift-add as the first half
288 + vadd.i8 d19, d17, d19
289 +
290 + vshl.i64 d19, d19, #8
291 + vadd.i8 d19, d17, d19
292 +
293 + vshl.i64 d19, d19, #8
294 + vadd.i8 d19, d17, d19
295 +
296 + vshl.i64 d19, d19, #8
297 + vadd.i8 d19, d17, d19
298 +
299 + vshl.i64 d19, d19, #8
300 + vadd.i8 d19, d17, d19
301 +
302 + vshl.i64 d19, d19, #8
303 + vadd.i8 d19, d17, d19
304 +
305 + vshl.i64 d19, d19, #8
306 + vadd.i8 d19, d17, d19
307 +
308 + vst1.8 {d18,d19},[r2]! @; store the result back
309 +
310 + add r1, r1, #16 @; add 16 to the loop counter(no of bytes completed)
311 + subs r4,r4,#1 @; decrement iteration count
312 + beq sub_filter_1bpp_16bytes_adjust
313 +
314 +
315 + vext.8 d22, d19, d21, #7 @; more iterations to go
316 + @; extract the last computed value
317 + vld1.8 {d16,d17}, [r2] @; load the next 16 bytes
318 + vadd.i8 d16, d16, d22 @; set up the input by adding the previous pixel
319 + @; value to the input
320 + b sub_filter_1bpp_16bytes
321 +
322 +sub_filter_1bpp_16bytes_adjust:
323 +
324 + cmp r1, r0 @; bytes completed == rowbytes? (tested by the beq below; sub leaves flags intact)
325 + sub r2, r2, #1 @; more pixels remaining
326 + @; r2 points to the current pixel, adjust it
327 + @; so that it points to the prev pixel for the below loop
328 + beq sub_filter_DONE
329 +
330 +sub_filter_1bpp_16bytes_done:
331 +
332 +
333 + vld1.8 {d0[0]},[r2]! @; load 1 byte (1 pixel) into D 0[0]
334 + @; increment row pointer
335 +sub_filter_1bpp_loop:
336 + add r1,r1,r12 @; loop counter += bpp
337 + cmp r1,r0 @;
338 +
339 + vld1.8 {d2[0]},[r2] @; load 1 byte (current pixel) into D2[0]
340 +
341 + vadd.i8 d0,d0,d2 @; vector add 1 byte of previou s pixel with
342 + @; 1 byte of current pixel
343 + vst1.8 {d0[0]},[r2]! @; store 1 byte (updated pixel) back
344 + @; into row pointer location a nd increment
345 + @; row pointer
346 +
347 + bne sub_filter_1bpp_loop @; loop back until loop counter == rowbytes
348 +
349 + b sub_filter_DONE @; return
350 +
351 + #;; -----------------------------
352 + #;; SUB filter, 2 bytes per pixel
353 + #;; -----------------------------
354 +sub_filter_2bpp:
355 +
356 + lsrs r4,r0,#4 @; r4 = floor(rowbytes/16)
357 + @; = iteration count for loop 16
358 + beq sub_filter_2bpp_16bytes_done
359 +
360 + vmov.i8 d21, #0 @; d21 = 0, zero fill for the vext extractions in the loop
361 + vld1.8 {d16,d17}, [r2] @; load 16 bytes to q8
362 + @; d16 = a b c d e f g h
363 + @; d17 = i j k l m n o p
364 + mov r1, #0 @; r1 = number of bytes completed so far
365 +sub_filter_2bpp_16bytes:
366 +
367 + vshl.i64 d18, d16, #16 @; each pixel is 2bytes .. shi ft by 16 bits to get previous pixel
368 + vadd.i8 d18, d16, d18 @; add to the current pixel
369 +
370 + vshl.i64 d18, d18, #16 @; shift-add to propagate the c omputed sum as the case for 1bpp
371 + vadd.i8 d18, d16, d18
372 +
373 + vshl.i64 d18, d18, #16
374 + vadd.i8 d18, d16, d18
375 +
376 +
377 + vext.8 d22, d18, d21, #6 @; extract the last computed va lue (i.e. last 2 bytes)
378 + vadd.i8 d17, d17, d22 @; add the last computed pixel to the input
379 +
380 + vshl.i64 d19, d17, #16
381 + vadd.i8 d19, d17, d19
382 +
383 + vshl.i64 d19, d19, #16
384 + vadd.i8 d19, d17, d19
385 +
386 + vshl.i64 d19, d19, #16
387 + vadd.i8 d19, d17, d19
388 +
389 +
390 + vst1.8 {d18,d19},[r2]! @; store the result back
391 +
392 +
393 + add r1, r1, #16 @; add 16 to the loop counter(n o of bytes completed)
394 + subs r4,r4,#1 @; decrement iteration count
395 + beq sub_filter_2bpp_16bytes_adjust
396 +
397 +
398 + vext.8 d22, d19, d21, #6 @; extract the last computed va lue
399 + @; add the last computed pixel to the input
400 + vld1.8 {d16,d17}, [r2]
401 + vadd.i8 d16, d16, d22
402 +
403 + b sub_filter_2bpp_16bytes
404 +
405 +
406 +sub_filter_2bpp_16bytes_adjust:
407 +
408 + cmp r1, r0 @; no more pixels left .. exit
409 + sub r2, r2, #2 @; more pixels remaining
410 + @; r2 points to the current pix el adjust it
411 + @; so that it points to the pre v pixel for the below loop
412 + beq sub_filter_DONE
413 +
414 +sub_filter_2bpp_16bytes_done:
415 +
416 + vld1.16 {d0[0]},[r2]! @; load 2 bytes (1 pixel) into D0[0]
417 + @; increment row pointer
418 +sub_filter_2bpp_loop:
419 + add r1,r1,r12 @; loop counter += bpp
420 + cmp r1,r0 @;
421 +
422 + vld1.16 {d2[0]},[r2] @; load 2 bytes (current pixel) into D2[0]
423 + vadd.i8 d0,d0,d2 @; vector add 2 bytes of previo us pixel with
424 + @; 2 bytes of curren t pixel
425 + vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel ) back
426 + @; into row pointer location a nd increment
427 + @; row pointer
428 +
429 + bne sub_filter_2bpp_loop @; loop back until loop counter == rowbytes
430 + @
431 + b sub_filter_DONE @ ; return
432 +
433 + #;; -----------------------------
434 + #;; SUB filter, 3 bytes per pixel
435 + #;; -----------------------------
436 +sub_filter_3bpp:
437 + vld1.32 {d0[0]},[r2], r12 @; load 4 bytes (1 pixel + 1 ex tra byte) into D0[0]
438 + @; increment row pointer by bpp
439 +sub_filter_3bpp_loop:
440 + add r1,r1,r12 @; loop counter += bpp
441 + cmp r1,r0 @;
442 +
443 + vld1.32 {d2[0]},[r2] @; load 4 bytes (current pixel + 1 extra byte) into D2[0]
444 + vadd.i8 d0,d0,d2 @; vector add 3 bytes of previo us pixel with
445 + @; 3 bytes of curren t pixel
446 + vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel ) back
447 + @; into row pointer location a nd increment
448 + @; row pointer
449 + vst1.8 {d0[2]},[r2]! @; store 1 byte (updated pixel) back
450 + @; into row pointer location a nd increment
451 + @; row pointer
452 +
453 + bne sub_filter_3bpp_loop @; loop back until loop counter == rowbytes
454 +
455 + b sub_filter_DONE @; return
456 +
457 + #;; -----------------------------
458 + #;; SUB filter, 4 bytes per pixel
459 + #;; -----------------------------
460 +sub_filter_4bpp:
461 + vld1.32 {d0[0]},[r2]! @; load 4 bytes (1 pixel) into D0[0]
462 + @; increment row pointer
463 +sub_filter_4bpp_loop: @
464 + add r1,r1,r12 @; loop counter += bpp
465 + cmp r1,r0 @;
466 +
467 +
468 + vld1.32 {d2[0]},[r2] @; load 4 bytes (current pixel) into D2[0]
469 + vadd.i8 d0,d0,d2 @; vector add 4 bytes of previo us pixel with
470 + @; 4 bytes of curren t pixel
471 + vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel ) back
472 + @; into row pointer location a nd increment
473 + @; row pointer
474 +
475 + bne sub_filter_4bpp_loop @; loop back until loop counter == rowbytes
476 +
477 + b sub_filter_DONE @; return
478 +
479 + #;; -----------------------------
480 + #;; SUB filter, 6 bytes per pixel
481 + #;; -----------------------------
482 +sub_filter_6bpp:
483 + vld1.8 {d0},[r2],r12 @; load 8 bytes (1 pixel + 2 ext ra bytes) into D0
484 + @; increment row pointer by bpp
485 +sub_filter_6bpp_loop: @
486 + add r1,r1,r12 @; loop counter += bpp
487 + cmp r1,r0 @;
488 +
489 + vld1.8 {d2},[r2] @; load 8 bytes (1 pixel + 2 ext ra bytes) into D2
490 + vadd.i8 d0,d0,d2 @; vector add 6 bytes of previou s pixel with
491 + @; 6 bytes of current pixel
492 + vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel) back
493 + @; into row pointer location an d increment
494 + @; row pointer
495 + vst1.16 {d0[2]},[r2]! @; store 2 bytes (updated pixel) back
496 + @; into row pointer location an d increment
497 + @; row pointer
498 +
499 + bne sub_filter_6bpp_loop @; loop back until loop counter == rowbytes
500 +
501 + b sub_filter_DONE @; return
502 +
503 + #;; -----------------------------
504 + #;; SUB filter, 8 bytes per pixel
505 + #;; -----------------------------
506 +sub_filter_8bpp:
507 + vld1.8 {d0},[r2]! @; load 8 bytes (1 pixel) into D 0
508 + @; increment row pointer
509 +sub_filter_8bpp_loop: @
510 + add r1,r1,r12 @; loop counter += bpp
511 + cmp r1,r0 @;
512 + vld1.8 {d2},[r2] @; load 8 bytes (current pixel) into D2
513 + vadd.i8 d0,d0,d2 @; vector add 8 bytes of previou s pixel with
514 + @; 8 bytes of current pixel
515 + vst1.8 {d0},[r2]! @; store 8 bytes (updated pixel) back
516 + @; into row pointer location an d increment
517 + @; row pointer
518 +
519 +
520 + bne sub_filter_8bpp_loop @; loop back until loop counter == rowbytes
521 + @
522 + b sub_filter_DONE @ ; return
523 +
524 +sub_filter_DONE:
525 +
526 + ldmia sp!, {r4}
527 + bx r14
528 +
529 + #;; --------------
530 + #;; UP filter type
531 + #;; --------------
532 +up_filter:
533 +
534 + #;; r0 = rowbytes
535 + #;; r1 = pixel_depth (not required for UP filter type)
536 + #;; r2 = row pointer
537 + #;; r3 = previous row pointer
538 +
539 +
540 + lsrs r1,r0,#5 @; r1 = floor(rowbytes/32)
541 + @; = iteration count for loop 32
542 + beq up_filter_32bytes_proc_done
543 +
544 +
545 +up_filter_32bytes_proc:
546 +
547 +
548 + mov r12, r2
549 +
550 + vld1.8 {q0},[r3]! @; load 32 bytes from previous
551 + vld1.8 {q2},[r3]! @; row and increment pointer
552 + @
553 + @
554 + vld1.8 {q1},[r12]! @; load 32 bytes from current ro w
555 + vld1.8 {q3},[r12]! @
556 + @
557 + @
558 + @
559 + vadd.i8 q0,q0,q1 @; vector add of 16 bytes
560 + vadd.i8 q2,q2,q3 @
561 + @
562 + @
563 + @
564 + vst1.8 {q0},[r2]! @; store 32 bytes to current row
565 + vst1.8 {q2},[r2]! @
566 + @; and increment pointer
567 + sub r0,r0,#32 @; subtract 32 from rowbytes
568 + subs r1,r1,#1 @; decrement iteration count
569 + bne up_filter_32bytes_proc
570 +
571 +
572 +
573 +up_filter_32bytes_proc_done:
574 +
575 + lsrs r1,r0,#4 @; r1 = floor(rowbytes/16)
576 + @; = iteration count for loop 16
577 + beq up_filter_16bytes_proc_done
578 +
579 +up_filter_16bytes_proc:
580 +
581 + vld1.8 {q0},[r3]! @; load 16 bytes from previous
582 + @; row and increment pointer
583 + vld1.8 {q1},[r2] @; load 16 bytes from current ro w
584 + vadd.i8 q0,q0,q1 @; vector add of 16 bytes
585 + vst1.8 {q0},[r2]! @; store 16 bytes to current row
586 + @; and increment pointer
587 + sub r0,r0,#16 @; subtract 16 from rowbytes
588 + subs r1,r1,#1 @; decrement iteration count
589 + bne up_filter_16bytes_proc
590 +
591 +up_filter_16bytes_proc_done:
592 +
593 + lsrs r1,r0,#3 @; r1 = floor(rowbytes/8)
594 + beq up_filter_8bytes_proc_done
595 +
596 +up_filter_8bytes_proc:
597 +
598 + vld1.8 {d0},[r3]! @; load 8 bytes from previous
599 + @; row and increment pointer
600 + vld1.8 {d2},[r2] @; load 8 bytes from current row
601 + vadd.i8 d0,d0,d2 @; vector add 8 bytes
602 + vst1.8 {d0},[r2]! @; store 8 bytes to current row
603 + @; and increment pointer
604 + sub r0,r0,#8 @; subtract 8 from rowbytes
605 +
606 +up_filter_8bytes_proc_done:
607 +
608 + lsrs r1,r0,#2 @; r1 = floor(rowbytes/4)
609 + beq up_filter_4bytes_proc_done
610 +
611 +up_filter_4bytes_proc:
612 +
613 + vld1.32 {d0[0]},[r3]! @; load 4 bytes from previous ro w
614 + @; and increment pointer
615 + vld1.32 {d2[0]},[r2] @; load 4 bytes from current row
616 + vadd.i8 d0,d0,d2 @; vector add 4 bytes
617 + vst1.32 {d0[0]},[r2]! @; store 4 bytes to current row
618 + @; and increment pointer
619 + sub r0,r0,#4 @; subtract 4 from rowbytes
620 +
621 +up_filter_4bytes_proc_done:
622 +
623 + lsrs r1,r0,#1 @; r1 = floor(rowbytes/2)
624 + beq up_filter_2bytes_proc_done
625 +
626 +up_filter_2bytes_proc:
627 +
628 + vld1.16 {d0[0]},[r3]! @; load 2 bytes from previous ro w
629 + @; and increment pointer
630 + vld1.16 {d2[0]},[r2] @; load 2 bytes from current row
631 + vadd.i8 d0,d0,d2 @; vector add 2 bytes
632 + vst1.16 {d0[0]},[r2]! @; store 2 bytes to current row
633 + @; and increment pointer
634 + sub r0,r0,#2 @; subtract 2 from rowbytes
635 +
636 +up_filter_2bytes_proc_done:
637 +
638 + cmp r0,#0
639 + beq up_filter_1byte_proc_done
640 +
641 +up_filter_1byte_proc:
642 +
643 + vld1.8 {d0[0]},[r3]! @; load 1 byte from previous row
644 + @; and increment pointer
645 + vld1.8 {d2[0]},[r2] @; load 1 byte from current row
646 + vadd.i8 d0,d0,d2 @; vector add 1 byte
647 + vst1.8 {d0[0]},[r2]! @; store 1 byte to current row
648 + @; and increment pointer
649 +up_filter_1byte_proc_done:
650 +
651 + b DONE
652 +
653 + #;; ---------------
654 + #;; AVG filter type
655 + #;; ---------------
656 +avg_filter:
657 +
658 + add r1,r1,#7 @; bpp = bytes per pixel
659 + lsr r1,r1,#3 @; = (pixel_depth + 7) >> 3
660 + mov r12,r1
661 +
662 + #;; r0 = rowbytes
663 + #;; r1 = loop counter = bpp (initially)
664 + #;; r2 = row pointer
665 + #;; r3 = previous row pointer
666 + #;; r12 = bpp = loop/pointer increment value
667 +
668 + cmp r12,#1
669 + beq avg_filter_1bpp
670 +
671 + cmp r12,#2
672 + beq avg_filter_2bpp
673 +
674 + cmp r12,#3
675 + beq avg_filter_3bpp
676 +
677 + cmp r12,#4
678 + beq avg_filter_4bpp
679 +
680 + cmp r12,#6
681 + beq avg_filter_6bpp
682 +
683 + cmp r12,#8
684 + beq avg_filter_8bpp
685 +
686 +avg_filter_exit:
687 + b DONE @; return
688 +
689 + #;; ----------------------------
690 + #;; AVG filter, 1 byte per pixel
691 + #;; ----------------------------
692 +avg_filter_1bpp:
693 +
694 + cmp r1,r0
695 +
696 + vld1.8 {d0[0]},[r2] @; load 1 byte (pixel x) from c urr
697 + @; row into d0[0]
698 + vld1.8 {d1[0]},[r3]! @; load 1 byte (pixel b) from p rev
699 + @; row into d1[0]
700 + @; increment prev row pointer
701 + vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add
702 + @; to pixel x
703 + vst1.8 {d0[0]},[r2]! @; store 1 byte (updated pixel x)
704 + @; increment curr row pointer
705 + @; updated pixel x is now pixel a
706 + beq DONE
707 +
708 +avg_filter_1bpp_loop:
709 + add r1,r1,r12 @; loop counter += bpp
710 + cmp r1,r0
711 +
712 +
713 + vld1.8 {d2[0]},[r2] @; load 1 byte (pixel x) from c urr
714 + @; row into d2[0]
715 + vld1.8 {d1[0]},[r3]! @; load 1 byte (pixel b) from p rev
716 + @; row into d1[0]
717 + vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b)
718 + vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2
719 + vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2)
720 + vst1.8 {d0[0]},[r2]! @; store 1 byte (updated pixel x)
721 + @; increment curr row pointer
722 + bne avg_filter_1bpp_loop
723 +
724 + b DONE @; exit loop when
725 + @; loop counter == rowbytes
726 + #;; -----------------------------
727 + #;; AVG filter, 2 bytes per pixel
728 + #;; -----------------------------
729 +avg_filter_2bpp:
730 +
731 + cmp r1,r0
732 +
733 + vld1.16 {d0[0]},[r2] @; load 2 bytes (pixel x) from curr
734 + @; row into d0[0]
735 + vld1.16 {d1[0]},[r3]! @; load 2 bytes (pixel b) from prev
736 + @; row into d1[0]
737 + @; increment prev row pointer
738 + vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add
739 + @; to pixel x
740 + vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x)
741 + @; increment curr row pointer
742 + @; updated pixel x is now pixel a
743 + beq DONE
744 +
745 +avg_filter_2bpp_loop:
746 + add r1,r1,r12 @; loop counter += bpp
747 + cmp r1,r0
748 +
749 +
750 + vld1.16 {d2[0]},[r2] @; load 2 bytes (pixel x) from curr
751 + @; row into d2[0]
752 + vld1.16 {d1[0]},[r3]! @; load 2 bytes (pixel b) from prev
753 + @; row into d1[0]
754 + vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b)
755 + vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2
756 + vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2)
757 + vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x)
758 + @; increment curr row pointer
759 +
760 + bne avg_filter_2bpp_loop
761 +
762 + b DONE @; exit loop when
763 + @; loop counter == rowbytes
764 +
765 + #;; -----------------------------
766 + #;; AVG filter, 3 bytes per pixel
767 + #;; -----------------------------
768 +avg_filter_3bpp:
769 +
770 + cmp r1,r0
771 +
772 + vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x + 1 ex tra
773 + @; byte) from curr row into d0 [0]
774 + vld1.32 {d1[0]},[r3],r12 @; load 4 bytes (pixel b + 1 ex tra
775 + @; byte) from prev row into d1 [0]
776 + @; increment prev row pointer
777 + vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add
778 + @; to pixel x
779 + vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x)
780 + @; increment curr row pointer
781 + vst1.8 {d0[2]},[r2]! @; store 1 byte (updated pixel x)
782 + @; increment curr row pointer
783 + @; updated pixel x is now pixel a
784 + beq DONE
785 +
786 +avg_filter_3bpp_loop:
787 + add r1,r1,r12 @; loop counter += bpp
788 + cmp r1,r0
789 +
790 + vld1.32 {d2[0]},[r2] @; load 4 bytes (pixel x + 1 ex tra
791 + @; byte) from curr row into d2 [0]
792 + vld1.32 {d1[0]},[r3],r12 @; load 4 bytes (pixel b + 1 ex tra
793 + @; byte) from prev row into d1 [0]
794 + vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b)
795 + vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2
796 + vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2)
797 + vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x)
798 + @; increment curr row pointer
799 + vst1.8 {d0[2]},[r2]! @; store 1 byte (updated pixel x)
800 + @; increment curr row pointer
801 +
802 + bne avg_filter_3bpp_loop
803 +
804 + b DONE @; exit loop when
805 + @; loop counter == rowbytes
806 + #;; -----------------------------
807 + #;; AVG filter, 4 bytes per pixel
808 + #;; -----------------------------
809 +avg_filter_4bpp:
810 +
811 + cmp r1,r0
812 +
813 + vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x) from curr
814 + @; row into d0[0]
815 + vld1.32 {d1[0]},[r3]! @; load 4 bytes (pixel b) from prev
816 + @; row into d1[0]
817 + @; increment prev row pointer
818 + vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add
819 + @; to pixel x
820 + vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x)
821 + @; increment curr row pointer
822 + @; updated pixel x is now pixel a
823 + beq DONE
824 +
825 +avg_filter_4bpp_loop:
826 + add r1,r1,r12 @; loop counter += bpp
827 + cmp r1,r0
828 +
829 +
830 + vld1.32 {d2[0]},[r2] @; load 4 bytes (pixel x) from curr
831 + @; row into d2[0]
832 + vld1.32 {d1[0]},[r3]! @; load 4 bytes (pixel b) from prev
833 + @; row into d1[0]
834 + vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b)
835 + vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2
836 + vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2)
837 + vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x)
838 + @; increment curr row pointer
839 + bne avg_filter_4bpp_loop
840 +
841 + b DONE @; exit loop when
842 + @; loop counter == rowbytes
843 + #;; -----------------------------
844 + #;; AVG filter, 6 bytes per pixel
845 + #;; -----------------------------
846 +avg_filter_6bpp:
847 +
848 + cmp r1,r0
849 +
850 + vld1.8 {d0},[r2] @; load 8 bytes (pixel x + 2 extra
851 + @; bytes) from curr row into d0
852 + vld1.8 {d1},[r3],r12 @; load 8 bytes (pixel b + 2 extra
853 + @; bytes) from prev row into d1
854 + @; increment prev row pointer
855 + vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add
856 + @; to pixel x
857 + vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x)
858 + @; increment curr row pointer
859 + @; updated pixel x is now pixel a
860 + vst1.16 {d0[2]},[r2]! @; store 2 bytes (updated pixel x)
861 + @; increment curr row pointer
862 + @; updated pixel x is now pixel a
863 + beq DONE
864 +
865 +avg_filter_6bpp_loop:
866 + add r1,r1,r12 @; loop counter += bpp
867 + cmp r1,r0
868 +
869 +
870 + vld1.8 {d2},[r2] @; load 8 bytes (pixel x + 2 extra
871 + @; bytes) from curr row into d2
872 + vld1.8 {d1},[r3],r12 @; load 8 bytes (pixel b + 2 extra
873 + @; bytes) from prev row into d1
874 + vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b)
875 + vshrn.i16 d1,q2,#1 @; d1 = (a + b)/2
876 + vadd.i8 d0,d2,d1 @; d0 = x + ((a + b)/2)
877 + vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x)
878 + @; increment curr row pointer
879 + vst1.16 {d0[2]},[r2]! @; store 2 bytes (updated pixel x)
880 + @; increment curr row pointer
881 + bne avg_filter_6bpp_loop
882 +
883 + b DONE @; exit loop when
884 + @; loop counter == rowbytes
885 + #;; -----------------------------
886 + #;; AVG filter, 8 bytes per pixel
887 + #;; -----------------------------
888 +avg_filter_8bpp:
889 +
890 + cmp r1,r0
891 +
892 + vld1.8 {d0},[r2] @; load 8 bytes (pixel x) from curr
893 + @; row into d0
894 + vld1.8 {d1},[r3]! @; load 8 bytes (pixel b) from prev
895 + @; row into d1
896 + @; increment prev row pointer
897 + vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add
898 + @; to pixel x
899 + vst1.8 {d0},[r2]! @; store 8 bytes (updated pixel x)
900 + @; increment curr row pointer
901 + @; updated pixel x is now pixel a
902 + beq DONE
903 +avg_filter_8bpp_loop:
904 + add r1,r1,r12 @; loop counter += bpp
905 + cmp r1,r0
906 +
907 +
908 + vld1.8 {d2},[r2] @; load 8 bytes (pixel x) from curr
909 + @; row into d2
910 + vld1.8 {d1},[r3]! @; load 8 bytes (pixel b) from prev
911 + @; row into d1
912 + vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b)
913 + vshrn.i16 d1,q2,#1 @; d1 = (a + b)/2
914 + vadd.i8 d0,d2,d1 @; d0 = x + ((a + b)/2)
915 + vst1.8 {d0},[r2]! @; store 8 bytes (updated pixel x)
916 + @; increment curr row pointer
917 + bne avg_filter_8bpp_loop
918 +
919 + b DONE @; exit loop when
920 + @; loop counter == rowbytes
921 + #;; -----------------
922 + #;; PAETH filter type
923 + #;; -----------------
924 +paeth_filter:
925 +
926 + VPUSH {q4-q7}
927 + add r1,r1,#7 @; bpp = bytes per pixel
928 + lsr r1,r1,#3 @; = (pixel_depth + 7) >> 3
929 + mov r12,r1
930 +
931 + #;; r0 = rowbytes
932 + #;; r1 = loop counter = bpp (initially)
933 + #;; r2 = row pointer
934 + #;; r3 = previous row pointer
935 + #;; r12 = bpp = loop/pointer increment value
936 +
937 +
938 + cmp r12,#1
939 + beq paeth_filter_1bpp
940 +
941 + cmp r12,#2
942 + beq paeth_filter_2bpp
943 +
944 + cmp r12,#3
945 + beq paeth_filter_3bpp
946 +
947 + cmp r12,#4
948 + beq paeth_filter_4bpp
949 +
950 + cmp r12,#6
951 + beq paeth_filter_6bpp
952 +
953 + cmp r12,#8
954 + beq paeth_filter_8bpp
955 +
956 +paeth_filter_exit:
957 + b paeth_filter_DONE @; return
958 +
959 + #;; ------------------------------
960 + #;; PAETH filter, 1 byte per pixel
961 + #;; ------------------------------
962 +paeth_filter_1bpp:
963 +
964 + cmp r1, r0
965 +
966 + vld1.8 {d0[0]},[r2] @; load 1 byte (pixel x) from curr
967 + @; row into d0[0]
968 + vld1.8 {d1[0]},[r3]! @; load 1 byte (pixel b) from prev
969 + @; row into d1[0]
970 + @; increment prev row pointer
971 + vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
972 + vst1.8 {d2[0]},[r2]! @; store 1 byte (updated pixel x)
973 + @; increment curr row pointer
974 +
975 + beq paeth_filter_DONE
976 +
977 +paeth_filter_1bpp_loop:
978 + add r1,r1,r12 @; loop counter += bpp
979 + cmp r1,r0
980 +
981 +
982 + #;; d1[0] = c (b in the previous loop iteration)
983 + #;; d2[0] = a (x in the previous loop iteration)
984 + vld1.8 {d3[0]},[r3]! @; load 1 byte (pixel b) from prev
985 + @; row into d3[0]
986 + vld1.8 {d0[0]},[r2] @; load 1 byte (pixel x) from curr
987 + @; row into d0[0]
988 + vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
989 + vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
990 + vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
991 + vaddl.u8 q5,d2,d3 @; q5 = a + b
992 + vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
993 +
994 + vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
995 + vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
996 + vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
997 + vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
998 + vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
999 + vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
1000 + @
1001 + vand d2,d2,d10 @; d2 = a where 1, 0 where 0
1002 + vbsl d14,d3,d1 @; d14 = b where 1, c where 0
1003 + vmvn d10,d10 @; invert d10
1004 + vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
1005 + vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
1006 + vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
1007 + vmov d1,d3 @; d1 = b (c for next iteration)
1008 + vst1.8 {d2[0]},[r2]! @; store 1 byte (updated pixel x)
1009 +
1010 +
1011 + bne paeth_filter_1bpp_loop
1012 +
1013 + b paeth_filter_DONE @; exit loop when
1014 + @; loop counter == rowbytes
1015 + #;; -------------------------------
1016 + #;; PAETH filter, 2 bytes per pixel
1017 + #;; -------------------------------
1018 +paeth_filter_2bpp:
1019 +
1020 + cmp r1, r0
1021 +
1022 + vld1.16 {d0[0]},[r2] @; load 2 bytes (pixel x) from curr
1023 + @; row into d0[0]
1024 + vld1.16 {d1[0]},[r3]! @; load 2 bytes (pixel b) from prev
1025 + @; row into d1[0]
1026 + @; increment prev row pointer
1027 + vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
1028 + vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x)
1029 + @; increment curr row pointer
1030 + beq paeth_filter_DONE
1031 +
1032 +paeth_filter_2bpp_loop:
1033 + add r1,r1,r12 @; loop counter += bpp
1034 + cmp r1,r0
1035 +
1036 + #;; d1[0] = c (b in the previous loop iteration)
1037 + #;; d2[0] = a (x in the previous loop iteration)
1038 + vld1.16 {d3[0]},[r3]! @; load 2 bytes (pixel b) from prev
1039 + @; row into d3[0]
1040 + vld1.16 {d0[0]},[r2] @; load 2 bytes (pixel x) from curr
1041 + @; row into d0[0]
1042 + vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
1043 + vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
1044 + vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
1045 + vaddl.u8 q5,d2,d3 @; q5 = a + b
1046 + vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
1047 +
1048 + vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
1049 + vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
1050 + vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
1051 + vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
1052 + vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
1053 + vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
1054 +
1055 + vand d2,d2,d10 @; d2 = a where 1, 0 where 0
1056 + vbsl d14,d3,d1 @; d14 = b where 1, c where 0
1057 + vmvn d10,d10 @; invert d10
1058 + vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
1059 + vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
1060 + vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
1061 + vmov d1,d3 @; d1 = b (c for next iteration)
1062 + vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x)
1063 + @; increment curr row pointer
1064 + bne paeth_filter_2bpp_loop
1065 +
1066 + b paeth_filter_DONE @; exit loop when
1067 + @; loop counter == rowbytes
1068 + #;; -------------------------------
1069 + #;; PAETH filter, 3 bytes per pixel
1070 + #;; -------------------------------
1071 +paeth_filter_3bpp:
1072 +
1073 + cmp r1, r0
1074 +
1075 + vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x + 1 extra
1076 + @; byte) from curr row into d0[0]
1077 + vld1.32 {d1[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra
1078 + @; byte) from prev row into d1[0]
1079 + @; increment prev row pointer
1080 + vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
1081 + vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x)
1082 + @; increment curr row pointer
1083 + vst1.8 {d2[2]},[r2]! @; store 1 byte (updated pixel x)
1084 + @; increment curr row pointer
1085 + beq paeth_filter_DONE
1086 +
1087 +paeth_filter_3bpp_loop:
1088 + add r1,r1,r12 @; loop counter += bpp
1089 + cmp r1,r0
1090 +
1091 +
1092 + #;; d1[0] = c (b in the previous loop iteration)
1093 + #;; d2[0] = a (x in the previous loop iteration)
1094 + vld1.32 {d3[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra
1095 + @; byte) from prev row into d3[0]
1096 + vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x + 1 extra
1097 + @; byte) from curr row into d0[0]
1098 + vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
1099 + vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
1100 + vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
1101 + vaddl.u8 q5,d2,d3 @; q5 = a + b
1102 + vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
1103 + @
1104 + vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
1105 + vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
1106 + vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
1107 + vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
1108 + vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
1109 + vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
1110 + @
1111 + vand d2,d2,d10 @; d2 = a where 1, 0 where 0
1112 + vbsl d14,d3,d1 @; d14 = b where 1, c where 0
1113 + vmvn d10,d10 @; invert d10
1114 + vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
1115 + vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
1116 + vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
1117 + vmov d1,d3 @; d1 = b (c for next iteration)
1118 + vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x)
1119 + @; increment curr row pointer
1120 + vst1.8 {d2[2]},[r2]! @; store 1 byte (updated pixel x)
1121 + @; increment curr row pointer
1122 + bne paeth_filter_3bpp_loop
1123 +
1124 + b paeth_filter_DONE @; exit loop when
1125 + @; loop counter == rowbytes
1126 + #;; -------------------------------
1127 + #;; PAETH filter, 4 bytes per pixel
1128 + #;; -------------------------------
1129 +paeth_filter_4bpp:
1130 +
1131 + cmp r1, r0
1132 +
1133 + vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x) from curr
1134 + @; row into d0[0]
1135 + vld1.32 {d1[0]},[r3]! @; load 4 bytes (pixel b) from prev
1136 + @; row into d1[0]
1137 + @; increment prev row pointer
1138 + vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
1139 + vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x)
1140 + @; increment curr row pointer
1141 + beq paeth_filter_DONE
1142 +
1143 +paeth_filter_4bpp_loop:
1144 + add r1,r1,r12 @; loop counter += bpp
1145 + cmp r1,r0
1146 +
1147 +
1148 + #;; d1[0] = c (b in the previous loop iteration)
1149 + #;; d2[0] = a (x in the previous loop iteration)
1150 + vld1.32 {d3[0]},[r3]! @; load 4 bytes (pixel b) from prev
1151 + @; row into d3[0]
1152 + vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x) from curr
1153 + @; row into d0[0]
1154 + vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
1155 + vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
1156 + vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
1157 + vaddl.u8 q5,d2,d3 @; q5 = a + b
1158 + vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
1159 + @
1160 + vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
1161 + vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
1162 + vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
1163 + vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
1164 + vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
1165 + vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
1166 + @
1167 + vand d2,d2,d10 @; d2 = a where 1, 0 where 0
1168 + vbsl d14,d3,d1 @; d14 = b where 1, c where 0
1169 + vmvn d10,d10 @; invert d10
1170 + vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
1171 + vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
1172 + vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
1173 + vmov d1,d3 @; d1 = b (c for next iteration)
1174 + vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x)
1175 + @; increment curr row pointer
1176 + bne paeth_filter_4bpp_loop
1177 +
1178 + b paeth_filter_DONE @; exit loop when
1179 + @; loop counter == rowbytes
1180 + #;; -------------------------------
1181 + #;; PAETH filter, 6 bytes per pixel
1182 + #;; -------------------------------
1183 +paeth_filter_6bpp:
1184 + cmp r1, r0
1185 +
1186 + vld1.8 {d0},[r2] @; load 8 bytes (pixel x + 2 extra
1187 + @; bytes) from curr row into d0
1188 + vld1.8 {d1},[r3],r12 @; load 8 bytes (pixel b + 2 extra
1189 + @; bytes) from prev row into d1
1190 + @; increment prev row pointer
1191 + vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
1192 + vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x)
1193 + @; increment curr row pointer
1194 + vst1.16 {d2[2]},[r2]! @; store 2 bytes (updated pixel x)
1195 + @; increment curr row pointer
1196 + beq paeth_filter_DONE
1197 +
1198 +paeth_filter_6bpp_loop:
1199 + add r1,r1,r12 @; loop counter += bpp
1200 + cmp r1,r0
1201 +
1202 +
1203 + #;; d1 = c (b in the previous loop iteration)
1204 + #;; d2 = a (x in the previous loop iteration)
1205 + vld1.8 {d3},[r3],r12 @; load 8 bytes (pixel b + 2 extra
1206 + @; bytes) from prev row into d3
1207 + vld1.8 {d0},[r2] @; load 8 bytes (pixel x + 2 extra
1208 + @; bytes) from curr row into d0
1209 + vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
1210 + vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
1211 + vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
1212 + vaddl.u8 q5,d2,d3 @; q5 = a + b
1213 + vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
1214 +
1215 + vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
1216 + vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
1217 + vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
1218 + vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
1219 + vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
1220 + vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
1221 +
1222 + vand d2,d2,d10 @; d2 = a where 1, 0 where 0
1223 + vbsl d14,d3,d1 @; d14 = b where 1, c where 0
1224 + vmvn d10,d10 @; invert d10
1225 + vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
1226 + vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
1227 + vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
1228 + vmov d1,d3 @; d1 = b (c for next iteration)
1229 + vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x)
1230 + @; increment curr row pointer
1231 + vst1.16 {d2[2]},[r2]! @; store 2 bytes (updated pixel x)
1232 + @; increment curr row pointer
1233 + bne paeth_filter_6bpp_loop
1234 +
1235 + b paeth_filter_DONE @; exit loop when
1236 + @; loop counter == rowbytes
1237 + #;; -------------------------------
1238 + #;; PAETH filter, 8 bytes per pixel
1239 + #;; -------------------------------
1240 +paeth_filter_8bpp:
1241 + cmp r1, r0
1242 +
1243 + vld1.8 {d0},[r2] @; load 8 bytes (pixel x) from curr
1244 + @; row into d0
1245 + vld1.8 {d1},[r3]! @; load 8 bytes (pixel b) from prev
1246 + @; row into d1
1247 + @; increment prev row pointer
1248 + vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
1249 + vst1.8 {d2},[r2]! @; store 8 bytes (updated pixel x)
1250 + @; increment curr row pointer
1251 + beq paeth_filter_DONE
1252 +
1253 +paeth_filter_8bpp_loop:
1254 + add r1,r1,r12 @; loop counter += bpp
1255 + cmp r1,r0
1256 +
1257 +
1258 + #;; d1 = c (b in the previous loop iteration)
1259 + #;; d2 = a (x in the previous loop iteration)
1260 + vld1.8 {d3},[r3]! @; load 8 bytes (pixel b) from prev
1261 + @; row into d3
1262 + vld1.8 {d0},[r2] @; load 8 bytes (pixel x) from curr
1263 + @; row into d0
1264 + vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
1265 + vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
1266 + vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
1267 + vaddl.u8 q5,d2,d3 @; q5 = a + b
1268 + vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
1269 + @
1270 + vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
1271 + vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
1272 + vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
1273 + vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
1274 + vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
1275 + vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
1276 + @
1277 + vand d2,d2,d10 @; d2 = a where 1, 0 where 0
1278 + vbsl d14,d3,d1 @; d14 = b where 1, c where 0
1279 + vmvn d10,d10 @; invert d10
1280 + vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
1281 + vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
1282 + vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
1283 + vmov d1,d3 @; d1 = b (c for next iteration)
1284 + vst1.8 {d2},[r2]! @; store 8 bytes (updated pixel x)
1285 + @; increment curr row pointer
1286 + bne paeth_filter_8bpp_loop
1287 +
1288 + b paeth_filter_DONE @; exit loop when
1289 + @; loop counter == rowbytes
1290 +paeth_filter_DONE:
1291 +
1292 + VPOP {q4-q7}
1293 + bx r14
1294 +
1295 +DONE:
1296 + bx r14
1297 +
1298 +
1299 +.size png_read_filter_row_neon, .-png_read_filter_row_neon
1300 + .END
1301 +#endif
1302 diff --git a/pngrutil.c b/pngrutil.c
1303 index 1e2db31..adfffb2 100755
1304 --- a/pngrutil.c
1305 +++ b/pngrutil.c
1306 @@ -23,6 +23,10 @@
1307 # define WIN32_WCE_OLD
1308 #endif
1309
1310 +#if defined(__ARM_NEON__)
1311 +extern void png_read_filter_row_neon(png_uint_32 rowbytes, png_byte pixel_depth, png_bytep row, png_bytep prev_row, int filter);
1312 +#endif
1313 +
1314 #ifdef PNG_FLOATING_POINT_SUPPORTED
1315 # ifdef WIN32_WCE_OLD
1316 /* The strtod() function is not supported on WindowsCE */
1317 @@ -2928,6 +2932,9 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep row,
1318 {
1319 png_debug(1, "in png_read_filter_row");
1320 png_debug2(2, "row = %lu, filter = %d", png_ptr->row_number, filter);
1321 +#if defined(__ARM_NEON__)
1322 + png_read_filter_row_neon(row_info->rowbytes, row_info->pixel_depth, row, prev_row, filter);
1323 +#else
1324 switch (filter)
1325 {
1326 case PNG_FILTER_VALUE_NONE:
1327 @@ -3043,6 +3050,7 @@ png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep row,
1328 *row = 0;
1329 break;
1330 }
1331 +#endif /* #if defined(__ARM_NEON__) */
1332 }
1333
1334 #ifdef PNG_SEQUENTIAL_READ_SUPPORTED
OLDNEW
« no previous file with comments | « no previous file | media-libs/libpng/libpng-1.2.44.ebuild » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698