Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(519)

Side by Side Diff: sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch

Issue 5176006: Applying Neon optimization patch to the ZLIB library. Base URL: http://git.chromium.org/git/portage.git@master
Patch Set: Created 10 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | sys-libs/zlib/zlib-1.2.3-r1.ebuild » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 commit df426dcc3179a3647a695c0fde66e1b3616fa6c0
2 Author: John Alayari <jalayari@codeaurora.org>
3 Date: Thu Nov 18 16:24:17 2010 -0800
4
5 committing the PNG and ZLIB Neon Optimizations files to local git repository.
6
7 diff --git a/Makefile.in b/Makefile.in
8 index 7da5a85..276c531 100644
9 --- a/Makefile.in
10 +++ b/Makefile.in
11 @@ -51,7 +51,7 @@ OBJS = adler32.o compress.o crc32.o gzio.o uncompr.o deflate.o trees.o \
12
13 PIC_OBJS = $(OBJS:%.o=%.lo)
14
15 -OBJA =
16 +OBJA =inflate_fast_copy_neon.o adler32_DO16_loop_neon.o
17 # to use the asm code: make OBJA=match.o
18
19 TEST_OBJS = example.o minigzip.o
20 @@ -82,8 +82,12 @@ match.o: match.S
21 %.lo: %.c
22 $(CC) $(CFLAGS) -DPIC -fPIC -c $< -o $@
23
24 -$(SHAREDLIBV): $(PIC_OBJS)
25 - $(LDSHARED) -o $@ $(PIC_OBJS) -lc $(LDFLAGS)
26 +%.o: %.S
27 + $(CC) $(CFLAGS) -DPIC -fPIC -c $< -o $@
28 +
29 +
30 +$(SHAREDLIBV): $(PIC_OBJS) $(OBJA)
31 + $(LDSHARED) -o $@ $(PIC_OBJS) $(OBJA) -lc $(LDFLAGS)
32 rm -f $(SHAREDLIB) $(SHAREDLIBM)
33 ln -s $@ $(SHAREDLIB)
34 ln -s $@ $(SHAREDLIBM)
35 diff --git a/adler32.c b/adler32.c
36 index 007ba26..a256e88 100644
37 --- a/adler32.c
38 +++ b/adler32.c
39 @@ -1,5 +1,6 @@
40 /* adler32.c -- compute the Adler-32 checksum of a data stream
41 * Copyright (C) 1995-2004 Mark Adler
42 + * Copyright (c) 2010, Code Aurora Forum. All rights reserved.
43 * For conditions of distribution and use, see copyright notice in zlib.h
44 */
45
46 @@ -18,6 +19,10 @@
47 #define DO8(buf,i) DO4(buf,i); DO4(buf,i+4);
48 #define DO16(buf) DO8(buf,0); DO8(buf,8);
49
50 +#if defined(__ARM_NEON__)
51 +extern void adler32_DO16_loop_neon(unsigned char** , unsigned long *, unsigned long * , int );
52 +#endif
53 +
54 /* use NO_DIVIDE if your processor does not do division in hardware */
55 #ifdef NO_DIVIDE
56 # define MOD(a) \
57 @@ -96,17 +101,25 @@ uLong ZEXPORT adler32(adler, buf, len)
58 /* do length NMAX blocks -- requires just one modulo operation */
59 while (len >= NMAX) {
60 len -= NMAX;
61 +#if defined(__ARM_NEON__)
62 + adler32_DO16_loop_neon(&buf, &adler, &sum2, NMAX);
63 +#else
64 n = NMAX / 16; /* NMAX is divisible by 16 */
65 do {
66 DO16(buf); /* 16 sums unrolled */
67 buf += 16;
68 } while (--n);
69 +#endif
70 MOD(adler);
71 MOD(sum2);
72 }
73
74 /* do remaining bytes (less than NMAX, still just one modulo) */
75 if (len) { /* avoid modulos if none remaining */
76 +
77 +#if defined(__ARM_NEON__)
78 + adler32_DO16_loop_neon(&buf, &adler, &sum2, len);
79 +#else
80 while (len >= 16) {
81 len -= 16;
82 DO16(buf);
83 @@ -116,6 +129,7 @@ uLong ZEXPORT adler32(adler, buf, len)
84 adler += *buf++;
85 sum2 += adler;
86 }
87 +#endif
88 MOD(adler);
89 MOD(sum2);
90 }
91 diff --git a/adler32_DO16_loop_neon.S b/adler32_DO16_loop_neon.S
92 new file mode 100755
93 index 0000000..1ba5147
94 --- /dev/null
95 +++ b/adler32_DO16_loop_neon.S
96 @@ -0,0 +1,195 @@
97 +#
98 +# Copyright (c) 2010, Code Aurora Forum. All rights reserved.
99 +#
100 +# Redistribution and use in source and binary forms, with or without
101 +# modification, are permitted provided that the following conditions
102 +# are met:
103 +# * Redistributions of source code must retain the above copyright
104 +# notice, this list of conditions and the following disclaimer.
105 +# * Redistributions in binary form must reproduce the above
106 +# copyright notice, this list of conditions and the following
107 +# disclaimer in the documentation and/or other materials provided
108 +# with the distribution.
109 +# * Neither the name of Code Aurora Forum, Inc. nor the names of its
110 +# contributors may be used to endorse or promote products derived
111 +# from this software without specific prior written permission.
112 +#
113 +# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
114 +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
115 +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
116 +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
117 +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
118 +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
119 +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
120 +# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
121 +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
122 +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
123 +# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
124 +#
125 +#if defined(__ARM_NEON__)
126 +#======================================================================
127 +
128 +#======================================================================
129 +# Code Section
130 +
131 + .code 32 @ Code is ARM ISA
132 +
133 + .global adler32_DO16_loop_neon
134 +
135 +#======================================================================
136 +# Function Name(s) : adler32_DO16_loop_neon
137 +#
138 +# Function Parameters
139 +# r0 = pointer to buf
140 +# r1 = pointer to adler
141 +# r2 = pointer to sum
142 +# r3 = len
143 +#
144 +# Register Usage
145 +# q0, q1 = input data
146 +# d18,d24 = adler
147 +# d16,d28 = sum
148 +# d30 = multiplication factor
149 +#
150 +#======================================================================
151 +#
152 +# algorithm:
153 +#
154 +# while (length < loop_counter)
155 +# do
156 +# length = length - loop_counter
157 +# for i = 0 to loop_counter
158 +# adler = adler_begin + input[i]
159 +# for i = 0 to loop_counter
160 +# sum2 = sum2_begin+(loop_counter-i)*input[i]
161 +# sum2 = sum2+(adler_begin*loop_counter)
162 +# adler_begin = adler
163 +# sum2_begin = sum2
164 +# end
165 +# end
166 +# end
167 +#
168 +# Here loop counter holds values of 16, 8 and 1 to compute
169 +# adler and sum for 16 bytes, 8 bytes and 1 byte at a time
170 +# adler_begin and sum2_begin are used to hold the values
171 +# of adler and sum2 from previous iterations.
172 +#
173 +#======================================================================
174 +.balign 32
175 +.type adler32_DO16_loop_neon, %function
176 +
177 +adler32_DO16_loop_neon: @ C proto: void adler32_DO16_loop_neon(unsigned char **buf, unsigned long *adler, unsigned long *sum2, int len)
178 + stmdb sp!,{r4-r7}
179 + ldr r7,[r0]
180 + vld1.32 {d18[0]},[r1] @load the input adler
181 + vld1.32 {d16[0]},[r2] @load the input sum
182 + ldr r5,=FACTOR16 @load the multiplication
183 + @factors for data elements
184 + vld1.8 {d20,d21},[r5] @load the multiplication
185 + @factor for adler.
186 + lsrs r4,r3,#4 @Calculate the number of
187 + @16-byte iterations
188 + beq adler32_DO16_loop_16bytes_done
189 + mov r6, #16
190 + vmov.32 d30[0],r6 @d30[0] = 16 = adler multiplier per block
191 +
192 +adler32_DO16_loop_16bytes:
193 + vld1.8 {d0,d1},[r7]! @load buf[0]..buf[15]
194 + vpaddl.u8 d6, d0 @pair wise add to reduce
195 + @8 elements to 4 and extend.
196 + vpaddl.u8 d7,d1 @pair wise add to reduce 8
197 + @elements to 4 and extend.
198 + vpadd.u16 d24,d6,d7 @pair wise add (i.e. no
199 + @need to extend 16 bits
200 + @sufficient to hold the sum).
201 + vpaddl.u16 d24,d24 @pair wise add to reduce 4
202 + @elements to 2 and extend.
203 + vpaddl.u32 d24,d24 @pair wise add to get the
204 + @adler of 16 inputs no need
205 + @to extend .. but only vpaddl
206 + @adds pair wise on one
207 + @doubleword.
208 + vadd.u32 d24,d18,d24 @adler'=adler+adler_of_16_inputs
209 + vmull.u8 q13,d20,d0 @sum'=mul_fac_for_inputs[0...7]
210 + @ * buf[0..7].
211 + vmlal.u8 q13,d21,d1 @sum'=sum'+ mul_fac_for_inputs
212 + @[8...15] * buf[8..15].
213 + vpadd.u16 d28,d26,d27 @pair wise add the doublewords
214 + vpaddl.u16 d28,d28 @pair wise add to reduce 4
215 + @elements to 2 and extend.
216 + vpaddl.u32 d28,d28 @pair wise add
217 + vadd.u32 d28,d16,d28 @sum' = sum + sum'
218 + vmla.u32 d28,d18,d30 @sum' = sum' + (adler*
219 + @mul_fac_for_adler).
220 + vmov.u32 d18,d24 @save adler for next iteration.
221 + vmov.u32 d16,d28 @save sum for next iteration.
222 + sub r3,r3,#16
223 + subs r4,r4,#1
224 + bne adler32_DO16_loop_16bytes
225 +
226 +adler32_DO16_loop_16bytes_done:
227 + lsrs r4, r3, #3 @find if there are at least 8 bytes
228 + beq adler32_DO16_loop_8bytes_done
229 +adler32_DO16_loop_8bytes:
230 + vld1.8 {d0},[r7]! @load buf[0]..buf[7]
231 + vpaddl.u8 d24,d0 @pair wise add to
232 + @reduce 8 elements to 4
233 + vpaddl.u16 d24,d24 @pair wise add to reduce
234 + @4 elements to 2
235 + vpaddl.u32 d24,d24 @pair wise add to get the
236 + @adler for 8 inputs
237 + vadd.u32 d24,d18,d24 @adler' = adler +
238 + @adler_for_8_inputs.
239 + vmull.u8 q13,d21,d0 @sum' = mul_fac_for_inputs[0..7]
240 + @ * buf[0..7] (d21 = factors 8..1)
241 + vpadd.u16 d28,d26,d27 @pair wise add to reduce 8
242 + @elements to 4
243 + vpaddl.u16 d28,d28 @pair wise add to reduce 4
244 + @elements to 2.
245 + vpaddl.u32 d28,d28 @pair wise add
246 + vadd.u32 d28,d16,d28 @sum' = sum + sum'
247 + vshl.u32 d30,d18,#3 @adler" = adler * 8 (d30 free: 16-byte loop done)
248 + vadd.u32 d28,d28,d30 @sum' = sum' + adler"
249 + vmov.u32 d18,d24 @save adler for next iteration
250 + vmov.u32 d16,d28 @save sum for next iteration
251 + sub r3,r3,#8 @at most one 8-byte chunk remains (r3<16 here), so no loop back
252 +
253 +adler32_DO16_loop_8bytes_done:
254 + cmp r3, #0 @find if there are remaining bytes
255 + beq DONE @after profiling found that a loop
256 + @to compute 4 or 2 bytes at a time
257 + @is less efficient than a byte by
258 + @byte computation.
259 + vmov.u64 d3, #0
260 +
261 +adler32_DO16_loop_remaining:
262 + vld1.8 {d3[0]}, [r7]! @load 1 byte of input
263 + subs r3,r3,#1
264 + vadd.u32 d24,d3,d18 @adler' = adler + *buf
265 + vadd.u32 d28,d24,d16 @sum' = sum + adler'
266 + vmov.u32 d18,d24
267 + vmov.u32 d16,d28
268 + bne adler32_DO16_loop_remaining
269 +
270 +DONE: @assumes len > 0 so d24/d28 were set on some path above — callers in adler32.c guarantee this
271 + vst1.32 {d24[0]},[r1] @write back adler
272 + vst1.32 {d28[0]},[r2] @write back sum2
273 + str r7, [r0] @write back advanced buf pointer
274 + ldmia sp!, {r4-r7}
275 + bx lr
276 +
277 +.size adler32_DO16_loop_neon, .-adler32_DO16_loop_neon
278 +
279 +.balign 16
280 +#======================================================================
281 +#FACTOR16 provides the multiplication factors for the inputs for 16
282 +#byte loops. The second half (i.e. 8 to 1) has the multiplications
283 +#factors for 8 byte loops.
284 +#======================================================================
285 +
286 +FACTOR16:
287 + .byte 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
288 +
289 +#endif
290 + .END
291 +
292 diff --git a/inffast.c b/inffast.c
293 index bbee92e..943cb92 100644
294 --- a/inffast.c
295 +++ b/inffast.c
296 @@ -8,6 +8,10 @@
297 #include "inflate.h"
298 #include "inffast.h"
299
300 +#if defined(__ARM_NEON__)
301 +extern void inflate_fast_copy_neon(unsigned len, unsigned char **out, unsigned char *from);
302 +#endif
303 +
304 #ifndef ASMINF
305
306 /* Allow machine dependent optimization for post-increment or pre-increment.
307 @@ -231,6 +235,9 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
308 from = out - dist; /* rest from output */
309 }
310 }
311 +#if defined(__ARM_NEON__)
312 + inflate_fast_copy_neon(len, &out, from);
313 +#else
314 while (len > 2) {
315 PUP(out) = PUP(from);
316 PUP(out) = PUP(from);
317 @@ -242,9 +249,13 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
318 if (len > 1)
319 PUP(out) = PUP(from);
320 }
321 +#endif
322 }
323 else {
324 from = out - dist; /* copy direct from output */
325 +#if defined(__ARM_NEON__)
326 + inflate_fast_copy_neon(len, &out, from);
327 +#else
328 do { /* minimum length is three */
329 PUP(out) = PUP(from);
330 PUP(out) = PUP(from);
331 @@ -256,6 +267,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
332 if (len > 1)
333 PUP(out) = PUP(from);
334 }
335 +#endif
336 }
337 }
338 else if ((op & 64) == 0) { /* 2nd level distance code */
339 diff --git a/inflate_fast_copy_neon.S b/inflate_fast_copy_neon.S
340 new file mode 100755
341 index 0000000..ec1e4ab
342 --- /dev/null
343 +++ b/inflate_fast_copy_neon.S
344 @@ -0,0 +1,521 @@
345 +#; Copyright (c) 2010, Code Aurora Forum. All rights reserved.
346 +#;
347 +#; Redistribution and use in source and binary forms, with or without
348 +#; modification, are permitted provided that the following conditions are
349 +#; met:
350 +#; * Redistributions of source code must retain the above copyright
351 +#; notice, this list of conditions and the following disclaimer.
352 +#; * Redistributions in binary form must reproduce the above
353 +#; copyright notice, this list of conditions and the following
354 +#; disclaimer in the documentation and/or other materials provided
355 +#; with the distribution.
356 +#; * Neither the name of Code Aurora Forum, Inc. nor the names of its
357 +#; contributors may be used to endorse or promote products derived
358 +#; from this software without specific prior written permission.
359 +#;
360 +#; THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
361 +#; WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
362 +#; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
363 +#; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
364 +#; BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
365 +#; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
366 +#; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
367 +#; BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
368 +#; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
369 +#; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
370 +#; IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
371 +#;
372 +#;
373 +#if defined(__ARM_NEON__)
374 +#;============================================================================
375 +#; Code Section
376 + .code 32 @; Code is ARM ISA
377 +#;============================================================================
378 +
379 + .global inflate_fast_copy_neon
380 +
381 +
382 +#;============================================================================
383 +#; INPUTS: r0 len: number of bytes to transfer
384 +#; r1 **out: pointer to pointer to ``out'' buffer
385 +#; r2 *from: pointer to ``from'' buffer
386 +#; OUTPUTS: r1 **out: pointer to pointer to ``out'' buffer
387 +#;============================================================================
388 +.balign 32
389 +.type inflate_fast_copy_neon, %function
390 +inflate_fast_copy_neon: @; C proto: void inflate_fast_copy_neon(unsigned len, unsigned char **out, unsigned char *from)
391 + push {r4-r11} @; push r4-r11 onto stack
392 +
393 + cmp r0,#16 @; take vectorized path only when len >= 16
394 + bge inflate_fast_copy_vectorized
395 +
396 + #;; transfer bytes one by one
397 + #;; only if len < 16 bytes
398 +inflate_fast_copy_default:
399 +
400 + cmp r0,#0
401 + beq inflate_fast_copy_exit
402 +
403 + ldr r3,[r1,#0] @; r3 = pointer to out
404 +
405 +inflate_fast_copy_default_loop:
406 +
407 + ldrb r12,[r2,#1]! @; r12 = *(++from)  (pre-increment matches zlib's PUP)
408 + subs r0,r0,#1 @; len--
409 + strb r12,[r3,#1]! @; *(++out) = r12
410 +
411 + bne inflate_fast_copy_default_loop
412 +
413 + str r3,[r1,#0] @; r1 = updated pointer to pointer
414 + @; to out
415 + b inflate_fast_copy_exit
416 +
417 + #;; vectorized copy routines
418 + #;; only if len >= 16 bytes
419 +inflate_fast_copy_vectorized:
420 +
421 + ldr r3,[r1,#0] @; r3 = pointer to out
422 + @; DON'T TOUCH r1 UNTIL FINAL
423 + @; UPDATE OF r1 WITH ADDRESS OF r3
424 + cmp r3,r2 @
425 + sublt r4,r2,r3 @
426 + subge r4,r3,r2 @;r4 = gap = |out-from| -- NOTE(review): gap replication assumes from/out alias one buffer; verify callers never pass a disjoint window `from` with accidental gap < 16
427 +
428 + cmp r4,#0
429 + beq inflate_fast_copy_exit
430 +
431 + cmp r4,#1
432 + beq inflate_fast_copy_gap1b_proc
433 +
434 + cmp r4,#2
435 + beq inflate_fast_copy_gap2b_proc
436 +
437 + cmp r4,#3
438 + beq inflate_fast_copy_gap3b_proc
439 +
440 + cmp r4,#4
441 + beq inflate_fast_copy_gap4b_proc
442 +
443 + cmp r4,#8
444 + blt inflate_fast_copy_gap5to7b_proc
445 + beq inflate_fast_copy_gap8b_proc
446 +
447 + cmp r4,#16
448 + blt inflate_fast_copy_gap9to15b_proc
449 + bge inflate_fast_copy_gap16b_proc
450 +
451 +
452 + #;; ------------------------------------------------------------------
453 + #;; vectorized copy routine when gap between ``from'' and ``out''
454 + #;; buffers is 1 byte
455 + #;; INPUTS:
456 + #;; r0 = len
457 + #;; r2 = pointer to from
458 + #;; r3 = pointer to out
459 + #;; OUTPUTS:
460 + #;; r1 = pointer to pointer to out
461 + #;; ------------------------------------------------------------------
462 +inflate_fast_copy_gap1b_proc:
463 +
464 + add r3,r3,#1 @; out++
465 + @
466 + ldrb r12,[r2,#1]! @; r12 = *(++from)
467 + vdup.8 q0, r12 @; duplicate r12 16 times in q0
468 + @
469 + lsrs r4,r0,#4 @; r4 = floor(len/16)
470 + @; = iteration count for loop16
471 + beq inflate_fast_copy_gap1b_proc_16bytes_loop_done
472 +
473 +inflate_fast_copy_gap1b_proc_16bytes_loop:
474 +
475 + vst1.8 {q0},[r3]! @; store 16 bytes in out and
476 + @; increment out pointer
477 + sub r0,r0,#16 @; subtract 16 from len
478 + subs r4,r4,#1 @; decrement iteration count
479 + bne inflate_fast_copy_gap1b_proc_16bytes_loop
480 +
481 +inflate_fast_copy_gap1b_proc_16bytes_loop_done:
482 +
483 + cmp r0,#0
484 + subeq r3,r3,#1 @; out--
485 + streq r3,[r1,#0] @; r1 = updated pointer to pointer
486 + @; to out
487 + beq inflate_fast_copy_exit
488 +
489 +inflate_fast_copy_gap1b_proc_lastfewbytes_loop:
490 +
491 + strb r12,[r3],#1 @; *out = r12, out++  (gap 1: every byte equals r12)
492 + subs r0,r0,#1 @; len--
493 + bne inflate_fast_copy_gap1b_proc_lastfewbytes_loop
494 +
495 + sub r3,r3,#1 @; out--
496 + str r3,[r1,#0] @; r1 = updated pointer to pointer
497 + @; to out
498 + b inflate_fast_copy_exit
499 +
500 + #;; ------------------------------------------------------------------
501 + #;; vectorized copy routine when gap between ``from'' and ``out''
502 + #;; buffers is 2 bytes
503 + #;; INPUTS:
504 + #;; r0 = len
505 + #;; r2 = pointer to from
506 + #;; r3 = pointer to out
507 + #;; OUTPUTS:
508 + #;; r1 = pointer to pointer to out
509 + #;; ------------------------------------------------------------------
510 +inflate_fast_copy_gap2b_proc:
511 +
512 + add r2,r2,#1 @; from++
513 + add r3,r3,#1 @; out++
514 + @
515 + vld1.16 {d0[0]},[r2] @; load 2 bytes into d0[0]
516 + vdup.16 q0,d0[0] @; duplicate those 2 bytes 8 times
517 + @; to fill up q0
518 + @
519 + lsrs r4,r0,#4 @; r4 = floor(len/16)
520 + @; = iteration count for loop16
521 + beq inflate_fast_copy_gap2b_proc_16bytes_loop_done
522 +
523 +inflate_fast_copy_gap2b_proc_16bytes_loop:
524 +
525 + vst1.8 {q0},[r3]! @; store 16 bytes in out and
526 + @; increment out pointer
527 + sub r0,r0,#16 @; subtract 16 from len
528 + subs r4,r4,#1 @; decrement iteration count
529 + bne inflate_fast_copy_gap2b_proc_16bytes_loop
530 +
531 +inflate_fast_copy_gap2b_proc_16bytes_loop_done:
532 +
533 + cmp r0,#0
534 + subeq r3,r3,#1 @; out--
535 + streq r3,[r1,#0] @; r1 = updated pointer to pointer
536 + @; to out
537 + beq inflate_fast_copy_exit
538 +
539 +inflate_fast_copy_gap2b_proc_lastfewbytes_loop:
540 +
541 + ldrb r12,[r2],#1 @; r12 = *from, from++
542 + subs r0,r0,#1 @; len--
543 + strb r12,[r3],#1 @; *out = r12, out++
544 + @
545 + bne inflate_fast_copy_gap2b_proc_lastfewbytes_loop
546 +
547 + sub r3,r3,#1 @; out--
548 + str r3,[r1,#0] @; r1 = updated pointer to pointer
549 + @; to out
550 + b inflate_fast_copy_exit
551 +
552 + #;; ------------------------------------------------------------------
553 + #;; vectorized copy routine when gap between ``from'' and ``out''
554 + #;; buffers is 3 bytes
555 + #;; INPUTS:
556 + #;; r0 = len
557 + #;; r2 = pointer to from
558 + #;; r3 = pointer to out
559 + #;; r4 = 3
560 + #;; OUTPUTS:
561 + #;; r1 = pointer to pointer to out
562 + #;; ------------------------------------------------------------------
563 +inflate_fast_copy_gap3b_proc:
564 +
565 + add r2,r2,#1 @; from++
566 + add r3,r3,#1 @; out++
567 + @
568 + vld1.32 {d0[0]},[r2] @; load 4 bytes into d0[0] (reads 1 byte past the 3-byte gap)
569 +
570 +inflate_fast_copy_gap3b_proc_3bytes_loop:
571 +
572 + cmp r0,#3 @; exit loop if len < 3
573 + blt inflate_fast_copy_gap3b_proc_3bytes_loop_done
574 +
575 + vst1.32 {d0[0]},[r3],r4 @; store 4 bytes in out
576 + @; out+=3; NOTE(review): final store may write 1 byte past out+len — verify output buffer slack
577 +
578 + sub r0,r0,#3 @; len-=3
579 + b inflate_fast_copy_gap3b_proc_3bytes_loop
580 +
581 +inflate_fast_copy_gap3b_proc_3bytes_loop_done:
582 +
583 + cmp r0,#0
584 + subeq r3,r3,#1 @; out--
585 + streq r3,[r1,#0] @; r1 = updated pointer to pointer
586 + @; to out
587 + beq inflate_fast_copy_exit
588 +
589 +inflate_fast_copy_gap3b_proc_lastfewbytes_loop:
590 +
591 + ldrb r12,[r2],#1 @; r12 = *from, from++
592 + subs r0,r0,#1 @; len--
593 + strb r12,[r3],#1 @; *out = r12, out++
594 +
595 + bne inflate_fast_copy_gap3b_proc_lastfewbytes_loop
596 +
597 + sub r3,r3,#1 @; out--
598 + str r3,[r1,#0] @; r1 = updated pointer to pointer
599 + @; to out
600 + b inflate_fast_copy_exit
601 +
602 + #;; ------------------------------------------------------------------
603 + #;; vectorized copy routine when gap between ``from'' and ``out''
604 + #;; buffers is 4 bytes
605 + #;; INPUTS:
606 + #;; r0 = len
607 + #;; r2 = pointer to from
608 + #;; r3 = pointer to out
609 + #;; OUTPUTS:
610 + #;; r1 = pointer to pointer to out
611 + #;; ------------------------------------------------------------------
612 +inflate_fast_copy_gap4b_proc:
613 +
614 + add r2,r2,#1 @; from++
615 + add r3,r3,#1 @; out++
616 + @
617 + vld1.32 {d0[0]},[r2] @; load 4 bytes into d0[0]
618 + vdup.32 q0,d0[0] @; duplicate those 4 bytes 4 times
619 + @; to fill up q0
620 + @
621 + lsrs r4,r0,#4 @; r4 = floor(len/16)
622 + @; = iteration count for loop16
623 + beq inflate_fast_copy_gap4b_proc_16bytes_loop_done
624 +
625 +inflate_fast_copy_gap4b_proc_16bytes_loop:
626 +
627 + vst1.32 {q0},[r3]! @; store 16 bytes in out and
628 + @; increment out pointer
629 + sub r0,r0,#16 @; subtract 16 from len
630 + subs r4,r4,#1 @; decrement iteration count
631 + bne inflate_fast_copy_gap4b_proc_16bytes_loop
632 +
633 +inflate_fast_copy_gap4b_proc_16bytes_loop_done:
634 +
635 + cmp r0,#0
636 + subeq r3,r3,#1 @; out--
637 + streq r3,[r1,#0] @; r1 = updated pointer to pointer
638 + @; to out
639 + beq inflate_fast_copy_exit
640 +
641 +inflate_fast_copy_gap4b_proc_lastfewbytes_loop:
642 +
643 + ldrb r12,[r2],#1 @; r12 = *from, from++
644 + subs r0,r0,#1 @; len--
645 + strb r12,[r3],#1 @; *out = r12, out++
646 +
647 + bne inflate_fast_copy_gap4b_proc_lastfewbytes_loop
648 +
649 + sub r3,r3,#1 @; out--
650 + str r3,[r1,#0] @; r1 = updated pointer to pointer
651 + @; to out
652 + b inflate_fast_copy_exit
653 +
654 + #;; ------------------------------------------------------------------
655 + #;; vectorized copy routine when gap between ``from'' and ``out''
656 + #;; buffers is {5-7} bytes
657 + #;; INPUTS:
658 + #;; r0 = len
659 + #;; r2 = pointer to from
660 + #;; r3 = pointer to out
661 + #;; r4 = {5-7}
662 + #;; OUTPUTS:
663 + #;; r1 = pointer to pointer to out
664 + #;; ------------------------------------------------------------------
665 +inflate_fast_copy_gap5to7b_proc:
666 +
667 + add r2,r2,#1 @; from++
668 + add r3,r3,#1 @; out++
669 + @
670 + vld1.8 {d0},[r2] @; load 8 bytes into d0 (reads up to 3 bytes past the gap)
671 +
672 +inflate_fast_copy_gap5to7b_proc_5to7bytes_loop:
673 +
674 + cmp r0,r4 @; exit loop if len < {5-7}
675 + blt inflate_fast_copy_gap5to7b_proc_5to7bytes_loop_done
676 +
677 + vst1.8 {d0},[r3],r4 @; store 8 bytes in out
678 + @; out+={5-7}; NOTE(review): may write up to 3 bytes past out+len — verify output buffer slack
679 +
680 + sub r0,r0,r4 @; len-={5-7}
681 + b inflate_fast_copy_gap5to7b_proc_5to7bytes_loop
682 +
683 +inflate_fast_copy_gap5to7b_proc_5to7bytes_loop_done:
684 +
685 + cmp r0,#0
686 + subeq r3,r3,#1 @; out--
687 + streq r3,[r1,#0] @; r1 = updated pointer to pointer
688 + @; to out
689 + beq inflate_fast_copy_exit
690 +
691 +inflate_fast_copy_gap5to7b_proc_lastfewbytes_loop:
692 +
693 + ldrb r12,[r2],#1 @; r12 = *from, from++
694 + subs r0,r0,#1 @; len--
695 + strb r12,[r3],#1 @; *out = r12, out++
696 +
697 + bne inflate_fast_copy_gap5to7b_proc_lastfewbytes_loop
698 +
699 + sub r3,r3,#1 @; out--
700 + str r3,[r1,#0] @; r1 = updated pointer to pointer
701 + @; to out
702 + b inflate_fast_copy_exit
703 +
704 + #;; ------------------------------------------------------------------
705 + #;; vectorized copy routine when gap between ``from'' and ``out''
706 + #;; buffers is 8 bytes
707 + #;; INPUTS:
708 + #;; r0 = len
709 + #;; r2 = pointer to from
710 + #;; r3 = pointer to out
711 + #;; OUTPUTS:
712 + #;; r1 = pointer to pointer to out
713 + #;; ------------------------------------------------------------------
714 +inflate_fast_copy_gap8b_proc:
715 +
716 + add r2,r2,#1 @; from++
717 + add r3,r3,#1 @; out++
718 + @
719 + vld1.8 {d0},[r2] @; load 8 bytes into d0
720 + vmov d1,d0 @; duplicate the 8 bytes to fill up
721 + @; q0
722 + @
723 + lsrs r4,r0,#4 @; r4 = floor(len/16)
724 + @; = iteration count for loop16
725 + beq inflate_fast_copy_gap8b_proc_16bytes_loop_done
726 +
727 +inflate_fast_copy_gap8b_proc_16bytes_loop:
728 +
729 + vst1.8 {q0},[r3]! @; store 16 bytes in out and
730 + @; increment out pointer
731 + sub r0,r0,#16 @; subtract 16 from len
732 + subs r4,r4,#1 @; decrement iteration count
733 + bne inflate_fast_copy_gap8b_proc_16bytes_loop
734 +
735 +inflate_fast_copy_gap8b_proc_16bytes_loop_done:
736 +
737 + cmp r0,#0
738 + subeq r3,r3,#1 @; out--
739 + streq r3,[r1,#0] @; r1 = updated pointer to pointer
740 + @; to out
741 + beq inflate_fast_copy_exit
742 +
743 +inflate_fast_copy_gap8b_proc_lastfewbytes_loop:
744 +
745 + ldrb r12,[r2],#1 @; r12 = *from, from++
746 + subs r0,r0,#1 @; len--
747 + strb r12,[r3],#1 @; *out = r12, out++
748 +
749 + bne inflate_fast_copy_gap8b_proc_lastfewbytes_loop
750 +
751 + sub r3,r3,#1 @; out--
752 + str r3,[r1,#0] @; r1 = updated pointer to pointer
753 + @; to out
754 + b inflate_fast_copy_exit
755 +
756 + #;; ------------------------------------------------------------------
757 + #;; vectorized copy routine when gap between ``from'' and ``out''
758 + #;; buffers is {9-15} bytes
759 + #;; INPUTS:
760 + #;; r0 = len
761 + #;; r2 = pointer to from
762 + #;; r3 = pointer to out
763 + #;; r4 = {9-15}
764 + #;; OUTPUTS:
765 + #;; r1 = pointer to pointer to out
766 + #;; ------------------------------------------------------------------
767 +inflate_fast_copy_gap9to15b_proc:
768 +
769 + add r2,r2,#1 @; from++
770 + add r3,r3,#1 @; out++
771 + @
772 + vld1.8 {q0},[r2] @; load 16 bytes into q0 (reads up to 7 bytes past the gap)
773 +
774 +inflate_fast_copy_gap9to15b_proc_9to15bytes_loop:
775 +
776 + cmp r0, r4 @; exit loop if len < {9-15}
777 + blt inflate_fast_copy_gap9to15b_proc_9to15bytes_loop_done
778 +
779 + vst1.8 {q0},[r3],r4 @; store 16 bytes in out
780 + @; out+={9-15}; NOTE(review): may write up to 7 bytes past out+len — verify output buffer slack
781 +
782 + sub r0,r0,r4 @; len-={9-15}
783 + b inflate_fast_copy_gap9to15b_proc_9to15bytes_loop
784 +
785 +inflate_fast_copy_gap9to15b_proc_9to15bytes_loop_done:
786 +
787 + cmp r0,#0
788 + subeq r3,r3,#1 @; out--
789 + streq r3,[r1,#0] @; r1 = updated pointer to pointer
790 + @; to out
791 + beq inflate_fast_copy_exit
792 +
793 +inflate_fast_copy_gap9to15b_proc_lastfewbytes_loop:
794 +
795 + ldrb r12,[r2],#1 @; r12 = *from, from++
796 + subs r0,r0,#1 @; len--
797 + strb r12,[r3],#1 @; *out = r12, out++
798 +
799 + bne inflate_fast_copy_gap9to15b_proc_lastfewbytes_loop
800 +
801 + sub r3,r3,#1 @; out--
802 + str r3,[r1,#0] @; r1 = updated pointer to pointer
803 + @; to out
804 + b inflate_fast_copy_exit
805 +
806 + #;; ------------------------------------------------------------------
807 + #;; vectorized copy routine when gap between ``from'' and ``out''
808 + #;; buffers is 16 bytes or more
809 + #;; INPUTS:
810 + #;; r0 = len
811 + #;; r2 = pointer to from
812 + #;; r3 = pointer to out
813 + #;; OUTPUTS:
814 + #;; r1 = pointer to pointer to out
815 + #;; ------------------------------------------------------------------
816 +inflate_fast_copy_gap16b_proc:
817 +
818 + add r2,r2,#1 @; from++
819 + add r3,r3,#1 @; out++
820 + @
821 + lsrs r4,r0,#4 @; r4 = floor(len/16)
822 + @; = iteration count for loop16
823 + beq inflate_fast_copy_gap16b_proc_16bytes_loop_done
824 +
825 +inflate_fast_copy_gap16b_proc_16bytes_loop:
826 +
827 + vld1.8 {q0},[r2]! @; load 16 bytes into q0 and
828 + @; increment from pointer
829 + vst1.8 {q0},[r3]! @; store 16 bytes in out and
830 + @; increment out pointer
831 + sub r0,r0,#16 @; subtract 16 from len
832 + subs r4,r4,#1 @; decrement iteration count
833 + bne inflate_fast_copy_gap16b_proc_16bytes_loop
834 +
835 +inflate_fast_copy_gap16b_proc_16bytes_loop_done:
836 +
837 + cmp r0,#0
838 + subeq r3,r3,#1 @; out--
839 + streq r3,[r1,#0] @; r1 = updated pointer to pointer
840 + @; to out
841 + beq inflate_fast_copy_exit
842 +
843 +inflate_fast_copy_gap16b_proc_lastfewbytes_loop:
844 +
845 + ldrb r12,[r2],#1 @; r12 = *from, from++
846 + subs r0,r0,#1 @; len--
847 + strb r12,[r3],#1 @; *out = r12, out++
848 +
849 + bne inflate_fast_copy_gap16b_proc_lastfewbytes_loop
850 +
851 + sub r3,r3,#1 @; out--
852 + str r3,[r1,#0] @; r1 = updated pointer to pointer
853 + @; to out
854 +
855 +inflate_fast_copy_exit:
856 +
857 + pop {r4-r11} @; pop r4-r11 from stack
858 + bx lr @; return
859 +
860 +.size inflate_fast_copy_neon, .-inflate_fast_copy_neon
861 +
862 +#endif
863 + .END
864 +
865 +
OLDNEW
« no previous file with comments | « no previous file | sys-libs/zlib/zlib-1.2.3-r1.ebuild » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698