| Index: sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch
|
| diff --git a/sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch b/sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..3e7698e96bfd4a6400a15e4142391b7cc973d7b3
|
| --- /dev/null
|
| +++ b/sys-libs/zlib/files/zlib-1.2.3-neon-optimized.patch
|
| @@ -0,0 +1,865 @@
|
| +commit df426dcc3179a3647a695c0fde66e1b3616fa6c0
|
| +Author: John Alayari <jalayari@codeaurora.org>
|
| +Date: Thu Nov 18 16:24:17 2010 -0800
|
| +
|
| +    Committing the PNG and ZLIB NEON optimization files to the local git repository.
|
| +
|
| +diff --git a/Makefile.in b/Makefile.in
|
| +index 7da5a85..276c531 100644
|
| +--- a/Makefile.in
|
| ++++ b/Makefile.in
|
| +@@ -51,7 +51,7 @@ OBJS = adler32.o compress.o crc32.o gzio.o uncompr.o deflate.o trees.o \
|
| +
|
| + PIC_OBJS = $(OBJS:%.o=%.lo)
|
| +
|
| +-OBJA =
|
| ++OBJA = inflate_fast_copy_neon.o adler32_DO16_loop_neon.o
|
| + # to use the asm code: make OBJA=match.o
|
| +
|
| + TEST_OBJS = example.o minigzip.o
|
| +@@ -82,8 +82,12 @@ match.o: match.S
|
| + %.lo: %.c
|
| + $(CC) $(CFLAGS) -DPIC -fPIC -c $< -o $@
|
| +
|
| +-$(SHAREDLIBV): $(PIC_OBJS)
|
| +- $(LDSHARED) -o $@ $(PIC_OBJS) -lc $(LDFLAGS)
|
| ++%.o: %.S
|
| ++ $(CC) $(CFLAGS) -DPIC -fPIC -c $< -o $@
|
| ++
|
| ++
|
| ++$(SHAREDLIBV): $(PIC_OBJS) $(OBJA)
|
| ++ $(LDSHARED) -o $@ $(PIC_OBJS) $(OBJA) -lc $(LDFLAGS)
|
| + rm -f $(SHAREDLIB) $(SHAREDLIBM)
|
| + ln -s $@ $(SHAREDLIB)
|
| + ln -s $@ $(SHAREDLIBM)
|
| +diff --git a/adler32.c b/adler32.c
|
| +index 007ba26..a256e88 100644
|
| +--- a/adler32.c
|
| ++++ b/adler32.c
|
| +@@ -1,5 +1,6 @@
|
| + /* adler32.c -- compute the Adler-32 checksum of a data stream
|
| + * Copyright (C) 1995-2004 Mark Adler
|
| ++ * Copyright (c) 2010, Code Aurora Forum. All rights reserved.
|
| + * For conditions of distribution and use, see copyright notice in zlib.h
|
| + */
|
| +
|
| +@@ -18,6 +19,10 @@
|
| + #define DO8(buf,i) DO4(buf,i); DO4(buf,i+4);
|
| + #define DO16(buf) DO8(buf,0); DO8(buf,8);
|
| +
|
| ++#if defined(__ARM_NEON__)
|
| ++extern void adler32_DO16_loop_neon(unsigned char **, unsigned long *, unsigned long *, int);
|
| ++#endif
|
| ++
|
| + /* use NO_DIVIDE if your processor does not do division in hardware */
|
| + #ifdef NO_DIVIDE
|
| + # define MOD(a) \
|
| +@@ -96,17 +101,25 @@ uLong ZEXPORT adler32(adler, buf, len)
|
| + /* do length NMAX blocks -- requires just one modulo operation */
|
| + while (len >= NMAX) {
|
| + len -= NMAX;
|
| ++#if defined(__ARM_NEON__)
|
| ++ adler32_DO16_loop_neon(&buf, &adler, &sum2, NMAX);
|
| ++#else
|
| + n = NMAX / 16; /* NMAX is divisible by 16 */
|
| + do {
|
| + DO16(buf); /* 16 sums unrolled */
|
| + buf += 16;
|
| + } while (--n);
|
| ++#endif
|
| + MOD(adler);
|
| + MOD(sum2);
|
| + }
|
| +
|
| + /* do remaining bytes (less than NMAX, still just one modulo) */
|
| + if (len) { /* avoid modulos if none remaining */
|
| ++
|
| ++#if defined(__ARM_NEON__)
|
| ++ adler32_DO16_loop_neon(&buf, &adler, &sum2, len);
|
| ++#else
|
| + while (len >= 16) {
|
| + len -= 16;
|
| + DO16(buf);
|
| +@@ -116,6 +129,7 @@ uLong ZEXPORT adler32(adler, buf, len)
|
| + adler += *buf++;
|
| + sum2 += adler;
|
| + }
|
| ++#endif
|
| + MOD(adler);
|
| + MOD(sum2);
|
| + }
|
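|
| A minimal C sketch (an editor's annotation, not part of the patch) of the
| contract the adler32.c hunks above assume for adler32_DO16_loop_neon():
| consume len bytes starting at *buf, update *adler and *sum2 exactly as the
| scalar DO16 loop would, and advance *buf; the MOD() reductions stay in the
| caller.
|
|     /* Hypothetical scalar equivalent of adler32_DO16_loop_neon();
|      * names mirror the extern declaration added above. */
|     static void adler32_DO16_loop_ref(unsigned char **buf,
|                                       unsigned long *adler,
|                                       unsigned long *sum2,
|                                       int len)
|     {
|         unsigned char *p = *buf;
|         while (len-- > 0) {
|             *adler += *p++;    /* adler' = adler + input byte */
|             *sum2  += *adler;  /* sum2'  = sum2  + adler'     */
|         }
|         *buf = p;              /* caller sees the advanced pointer */
|     }
|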
| +diff --git a/adler32_DO16_loop_neon.S b/adler32_DO16_loop_neon.S
|
| +new file mode 100755
|
| +index 0000000..1ba5147
|
| +--- /dev/null
|
| ++++ b/adler32_DO16_loop_neon.S
|
| +@@ -0,0 +1,195 @@
|
| ++#
|
| ++# Copyright (c) 2010, Code Aurora Forum. All rights reserved.
|
| ++#
|
| ++# Redistribution and use in source and binary forms, with or without
|
| ++# modification, are permitted provided that the following conditions
|
| ++# are met:
|
| ++# * Redistributions of source code must retain the above copyright
|
| ++# notice, this list of conditions and the following disclaimer.
|
| ++# * Redistributions in binary form must reproduce the above
|
| ++# copyright notice, this list of conditions and the following
|
| ++# disclaimer in the documentation and/or other materials provided
|
| ++# with the distribution.
|
| ++# * Neither the name of Code Aurora Forum, Inc. nor the names of its
|
| ++# contributors may be used to endorse or promote products derived
|
| ++# from this software without specific prior written permission.
|
| ++#
|
| ++# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
|
| ++# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
| ++# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
|
| ++# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
|
| ++# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
| ++# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
| ++# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
| ++# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
| ++# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
| ++# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
| ++# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| ++#
|
| ++#if defined(__ARM_NEON__)
|
| ++#======================================================================
|
| ++
|
| ++#======================================================================
|
| ++# Code Section
|
| ++
|
| ++ .code 32 @ Code is ARM ISA
|
| ++
|
| ++ .global adler32_DO16_loop_neon
|
| ++
|
| ++#======================================================================
|
| ++# Function Name(s) : adler32_DO16_loop_neon
|
| ++#
|
| ++# Function Parameters
|
| ++# r0 = pointer to buf
|
| ++# r1 = pointer to adler
|
| ++# r2 = pointer to sum
|
| ++# r3 = len
|
| ++#
|
| ++# Register Usage
|
| ++# q0, q1 = input data
|
| ++# d18,d24 = adler
|
| ++# d16,d28 = sum
|
| ++# d30 = multiplication factor
|
| ++#
|
| ++#======================================================================
|
| ++#
|
| ++# algorithm:
|
| ++#
|
| ++#   while (length >= loop_counter)
|
| ++#   do
|
| ++#     length = length - loop_counter
|
| ++#     adler = adler_begin
|
| ++#     for i = 0 to loop_counter - 1
|
| ++#       adler = adler + input[i]
|
| ++#     sum2 = sum2_begin
|
| ++#     for i = 0 to loop_counter - 1
|
| ++#       sum2 = sum2 + (loop_counter - i) * input[i]
|
| ++#     sum2 = sum2 + (adler_begin * loop_counter)
|
| ++#     adler_begin = adler
|
| ++#     sum2_begin = sum2
|
| ++#   end
|
| ++#
|
| ++# Here loop_counter takes the values 16, 8 and 1 to compute the
|
| ++# adler and sum2 updates 16 bytes, 8 bytes and 1 byte at a time.
|
| ++# adler_begin and sum2_begin hold the values of adler and sum2
|
| ++# from previous iterations.
|
| ++#
|
| ++#======================================================================
|
| ++.balign 32
|
| ++.type adler32_DO16_loop_neon, %function
|
| ++
|
| ++adler32_DO16_loop_neon:
|
| ++ stmdb sp!,{r4-r7}
|
| ++ ldr r7,[r0]
|
| ++ vld1.32 {d18[0]},[r1] @load the input adler
|
| ++ vld1.32 {d16[0]},[r2] @load the input sum
|
| ++ ldr r5,=FACTOR16 @load the multiplication
|
| ++ @factors for data elements
|
| ++ vld1.8 {d20,d21},[r5] @load the multiplication
|
| ++ @factor for adler.
|
| ++ lsrs r4,r3,#4 @Calculate the number
|
| ++                                @of 16-byte iterations
|
| ++ beq adler32_DO16_loop_16bytes_done
|
| ++ mov r6, #16
|
| ++ vmov.32 d30[0],r6
|
| ++
|
| ++adler32_DO16_loop_16bytes:
|
| ++ vld1.8 {d0,d1},[r7]! @load buf[0]..buf[15]
|
| ++ vpaddl.u8 d6, d0 @pair wise add to reduce
|
| ++ @8 elements to 4 and extend.
|
| ++ vpaddl.u8 d7,d1 @pair wise add to reduce 8
|
| ++ @elements to 4 and extend.
|
| ++ vpadd.u16 d24,d6,d7 @pair wise add (i.e. no
|
| ++ @need to extend 16 bits
|
| ++ @sufficient to hold the sum).
|
| ++ vpaddl.u16 d24,d24 @pair wise add to reduce 4
|
| ++ @elements to 2 and extend.
|
| ++ vpaddl.u32 d24,d24 @pair wise add to get the
|
| ++ @adler of 16 inputs no need
|
| ++ @to extend .. but only vpaddl
|
| ++ @adds pair wise on one
|
| ++ @doubleword.
|
| ++ vadd.u32 d24,d18,d24 @adler'=adler+adler_of_16_inputs
|
| ++ vmull.u8 q13,d20,d0 @sum'=mul_fac_for_inputs[0...7]
|
| ++ @ * buf[0..7].
|
| ++ vmlal.u8 q13,d21,d1 @sum'=sum'+ mul_fac_for_inputs
|
| ++ @[8...15] * buf[8..15].
|
| ++ vpadd.u16 d28,d26,d27 @pair wise add the doublewords
|
| ++ vpaddl.u16 d28,d28 @pair wise add to reduce 4
|
| ++ @elements to 2 and extend.
|
| ++ vpaddl.u32 d28,d28 @pair wise add
|
| ++ vadd.u32 d28,d16,d28 @sum' = sum + sum'
|
| ++ vmla.u32 d28,d18,d30 @sum' = sum' + (adler*
|
| ++ @mul_fac_for_adler).
|
| ++ vmov.u32 d18,d24 @save adler for next iteration.
|
| ++ vmov.u32 d16,d28 @save sum for next iteration.
|
| ++ sub r3,r3,#16
|
| ++ subs r4,r4,#1
|
| ++ bne adler32_DO16_loop_16bytes
|
| ++
|
| ++adler32_DO16_loop_16bytes_done:
|
| ++        lsrs r4, r3, #3         @find if there are at least 8 bytes
|
| ++ beq adler32_DO16_loop_8bytes_done
|
| ++adler32_DO16_loop_8bytes:
|
| ++ vld1.8 {d0},[r7]! @load buf[0] .buf[7]
|
| ++ vpaddl.u8 d24,d0 @pair wise add to
|
| ++ @reduce 8 elements to 4
|
| ++ vpaddl.u16 d24,d24 @pair wise add to reduce
|
| ++ @4 elements to 2
|
| ++ vpaddl.u32 d24,d24 @pair wise add to get the
|
| ++ @adler for 8 inputs
|
| ++ vadd.u32 d24,d18,d24 @adler' = adler +
|
| ++ @adler_for_8_inputs.
|
| ++ vmull.u8 q13,d21,d0 @sum' = mul_fac_for_inputs[0..7]
|
| ++ @ * buf[0..7]
|
| ++ vpadd.u16 d28,d26,d27 @pair wise add to reduce 8
|
| ++ @elements to 4
|
| ++ vpaddl.u16 d28,d28 @pair wise add to reduce 4
|
| ++ @elements to 2.
|
| ++ vpaddl.u32 d28,d28 @pair wise add
|
| ++ vadd.u32 d28,d16,d28 @sum' = sum + sum'
|
| ++ vshl.u32 d30,d18,#3 @adler" = adler * 8
|
| ++ vadd.u32 d28,d28,d30 @sum' = sum' + adler"
|
| ++ vmov.u32 d18,d24 @save adler for next iteration
|
| ++ vmov.u32 d16,d28 @save sum for next iteration
|
| ++ sub r3,r3,#8
|
| ++
|
| ++adler32_DO16_loop_8bytes_done:
|
| ++ cmp r3, #0 @find if there are remaining bytes
|
| ++        beq DONE                @profiling showed that a loop
|
| ++                                @computing 4 or 2 bytes at a
|
| ++                                @time is less efficient than
|
| ++                                @byte-by-byte computation.
|
| ++ vmov.u64 d3, #0
|
| ++
|
| ++adler32_DO16_loop_remaining:
|
| ++ vld1.8 {d3[0]}, [r7]! @load 1 byte of input
|
| ++ subs r3,r3,#1
|
| ++ vadd.u32 d24,d3,d18 @adler' = adler + *buf
|
| ++ vadd.u32 d28,d24,d16 @sum' = sum + adler'
|
| ++ vmov.u32 d18,d24
|
| ++ vmov.u32 d16,d28
|
| ++ bne adler32_DO16_loop_remaining
|
| ++
|
| ++DONE:
|
| ++ vst1.32 {d24[0]},[r1]
|
| ++ vst1.32 {d28[0]},[r2]
|
| ++ str r7, [r0]
|
| ++ ldmia sp!, {r4-r7}
|
| ++ bx lr
|
| ++
|
| ++.size adler32_DO16_loop_neon, .-adler32_DO16_loop_neon
|
| ++
|
| ++.balign 16
|
| ++#======================================================================
|
| ++#FACTOR16 provides the multiplication factors for the inputs of the
|
| ++#16-byte loops. The second half (i.e. 8 to 1) holds the multiplication
|
| ++#factors for the 8-byte loops.
|
| ++#======================================================================
|
| ++
|
| ++FACTOR16:
|
| ++ .byte 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
|
| ++
|
| ++#endif
|
| ++ .END
|
| ++
|
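|
| The FACTOR16 weights encode a per-block identity that can be checked in C;
| the sketch below (an annotation, not patch code) verifies that the blocked
| update the NEON routine uses matches the byte-by-byte recurrence for one
| 16-byte block.
|
|     /* For an n-byte block: adler grows by sum(in[i]); sum2 grows by
|      * n*adler_begin plus sum((n-i)*in[i]) -- the FACTOR16 weights. */
|     #include <assert.h>
|
|     static void check_block_identity(const unsigned char in[16],
|                                      unsigned long adler,
|                                      unsigned long sum2)
|     {
|         unsigned long a = adler, s = sum2;                /* reference */
|         for (int i = 0; i < 16; i++) { a += in[i]; s += a; }
|
|         unsigned long a2 = adler, s2 = sum2 + 16 * adler; /* blocked */
|         for (int i = 0; i < 16; i++) {
|             a2 += in[i];
|             s2 += (unsigned long)(16 - i) * in[i];  /* FACTOR16[i]*in[i] */
|         }
|
|         assert(a == a2 && s == s2);                 /* identical results */
|     }
|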
| +diff --git a/inffast.c b/inffast.c
|
| +index bbee92e..943cb92 100644
|
| +--- a/inffast.c
|
| ++++ b/inffast.c
|
| +@@ -8,6 +8,10 @@
|
| + #include "inflate.h"
|
| + #include "inffast.h"
|
| +
|
| ++#if defined(__ARM_NEON__)
|
| ++extern void inflate_fast_copy_neon(unsigned len, unsigned char **out, unsigned char *from);
|
| ++#endif
|
| ++
|
| + #ifndef ASMINF
|
| +
|
| + /* Allow machine dependent optimization for post-increment or pre-increment.
|
| +@@ -231,6 +235,9 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
|
| + from = out - dist; /* rest from output */
|
| + }
|
| + }
|
| ++#if defined(__ARM_NEON__)
|
| ++ inflate_fast_copy_neon(len, &out, from);
|
| ++#else
|
| + while (len > 2) {
|
| + PUP(out) = PUP(from);
|
| + PUP(out) = PUP(from);
|
| +@@ -242,9 +249,13 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
|
| + if (len > 1)
|
| + PUP(out) = PUP(from);
|
| + }
|
| ++#endif
|
| + }
|
| + else {
|
| + from = out - dist; /* copy direct from output */
|
| ++#if defined(__ARM_NEON__)
|
| ++ inflate_fast_copy_neon(len, &out, from);
|
| ++#else
|
| + do { /* minimum length is three */
|
| + PUP(out) = PUP(from);
|
| + PUP(out) = PUP(from);
|
| +@@ -256,6 +267,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
|
| + if (len > 1)
|
| + PUP(out) = PUP(from);
|
| + }
|
| ++#endif
|
| + }
|
| + }
|
| + else if ((op & 64) == 0) { /* 2nd level distance code */
|
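|
| One calling-convention note: with zlib 1.2.3's default pre-increment PUP()
| (OFF == 1), both out and from point one byte before the next byte to move.
| A hedged C sketch (not patch code) of the behavior the hunks above expect
| from inflate_fast_copy_neon(); the copy must stay byte-ordered so an
| overlapping source (dist < len) replicates correctly.
|
|     /* Hypothetical C equivalent of inflate_fast_copy_neon() under the
|      * pre-increment convention of inffast.c. */
|     static void inflate_fast_copy_ref(unsigned len, unsigned char **out,
|                                       unsigned char *from)
|     {
|         unsigned char *o = *out;
|         while (len-- > 0)
|             *++o = *++from;   /* ordered copy keeps overlap semantics */
|         *out = o;             /* hand back the advanced out pointer */
|     }
|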
| +diff --git a/inflate_fast_copy_neon.S b/inflate_fast_copy_neon.S
|
| +new file mode 100755
|
| +index 0000000..ec1e4ab
|
| +--- /dev/null
|
| ++++ b/inflate_fast_copy_neon.S
|
| +@@ -0,0 +1,521 @@
|
| ++#; Copyright (c) 2010, Code Aurora Forum. All rights reserved.
|
| ++#;
|
| ++#; Redistribution and use in source and binary forms, with or without
|
| ++#; modification, are permitted provided that the following conditions are
|
| ++#; met:
|
| ++#; * Redistributions of source code must retain the above copyright
|
| ++#; notice, this list of conditions and the following disclaimer.
|
| ++#; * Redistributions in binary form must reproduce the above
|
| ++#; copyright notice, this list of conditions and the following
|
| ++#; disclaimer in the documentation and/or other materials provided
|
| ++#; with the distribution.
|
| ++#; * Neither the name of Code Aurora Forum, Inc. nor the names of its
|
| ++#; contributors may be used to endorse or promote products derived
|
| ++#; from this software without specific prior written permission.
|
| ++#;
|
| ++#; THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
|
| ++#; WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
| ++#; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
|
| ++#; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
|
| ++#; BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
| ++#; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
| ++#; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
| ++#; BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
| ++#; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
| ++#; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
| ++#; IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| ++#;
|
| ++#;
|
| ++#if defined(__ARM_NEON__)
|
| ++#;============================================================================
|
| ++#; Code Section
|
| ++ .code 32 @; Code is ARM ISA
|
| ++#;============================================================================
|
| ++
|
| ++ .global inflate_fast_copy_neon
|
| ++
|
| ++
|
| ++#;============================================================================
|
| ++#; INPUTS: r0 len: number of bytes to transfer
|
| ++#; r1 **out: pointer to pointer to ``out'' buffer
|
| ++#; r2 *from: pointer to ``from'' buffer
|
| ++#; OUTPUTS: r1 **out: pointer to pointer to ``out'' buffer
|
| ++#;============================================================================
|
| ++.balign 32
|
| ++.type inflate_fast_copy_neon, %function
|
| ++inflate_fast_copy_neon:
|
| ++ push {r4-r11} @; push r4-r11 onto stack
|
| ++
|
| ++ cmp r0,#16 @;
|
| ++ bge inflate_fast_copy_vectorized
|
| ++
|
| ++ #;; transfer bytes one by one
|
| ++ #;; only if len < 16 bytes
|
| ++inflate_fast_copy_default:
|
| ++
|
| ++ cmp r0,#0
|
| ++ beq inflate_fast_copy_exit
|
| ++
|
| ++ ldr r3,[r1,#0] @; r3 = pointer to out
|
| ++
|
| ++inflate_fast_copy_default_loop:
|
| ++
|
| ++ ldrb r12,[r2,#1]! @; r12 = *(++from)
|
| ++ subs r0,r0,#1 @; len--
|
| ++ strb r12,[r3,#1]! @; *(++out) = r12
|
| ++
|
| ++ bne inflate_fast_copy_default_loop
|
| ++
|
| ++ str r3,[r1,#0] @; r1 = updated pointer to pointer
|
| ++ @; to out
|
| ++ b inflate_fast_copy_exit
|
| ++
|
| ++ #;; vectorized copy routines
|
| ++        #;; only if len >= 16 bytes
|
| ++inflate_fast_copy_vectorized:
|
| ++
|
| ++ ldr r3,[r1,#0] @; r3 = pointer to out
|
| ++ @; DON'T TOUCH r1 UNTIL FINAL
|
| ++ @; UPDATE OF r1 WITH ADDRESS OF r3
|
| ++ cmp r3,r2 @
|
| ++ sublt r4,r2,r3 @
|
| ++ subge r4,r3,r2 @;r4 = gap = |out-from|
|
| ++
|
| ++ cmp r4,#0
|
| ++ beq inflate_fast_copy_exit
|
| ++
|
| ++ cmp r4,#1
|
| ++ beq inflate_fast_copy_gap1b_proc
|
| ++
|
| ++ cmp r4,#2
|
| ++ beq inflate_fast_copy_gap2b_proc
|
| ++
|
| ++ cmp r4,#3
|
| ++ beq inflate_fast_copy_gap3b_proc
|
| ++
|
| ++ cmp r4,#4
|
| ++ beq inflate_fast_copy_gap4b_proc
|
| ++
|
| ++ cmp r4,#8
|
| ++ blt inflate_fast_copy_gap5to7b_proc
|
| ++ beq inflate_fast_copy_gap8b_proc
|
| ++
|
| ++ cmp r4,#16
|
| ++ blt inflate_fast_copy_gap9to15b_proc
|
| ++ bge inflate_fast_copy_gap16b_proc
|
| ++
|
| ++
|
| ++ #;; ------------------------------------------------------------------
|
| ++ #;; vectorized copy routine when gap between ``from'' and ``out''
|
| ++ #;; buffers is 1 byte
|
| ++ #;; INPUTS:
|
| ++ #;; r0 = len
|
| ++ #;; r2 = pointer to from
|
| ++ #;; r3 = pointer to out
|
| ++ #;; OUTPUTS:
|
| ++ #;; r1 = pointer to pointer to out
|
| ++ #;; ------------------------------------------------------------------
|
| ++inflate_fast_copy_gap1b_proc:
|
| ++
|
| ++ add r3,r3,#1 @; out++
|
| ++ @
|
| ++ ldrb r12,[r2,#1]! @; r12 = *(++from)
|
| ++ vdup.8 q0, r12 @; duplicate r12 16 times in q0
|
| ++ @
|
| ++ lsrs r4,r0,#4 @; r4 = floor(len/16)
|
| ++ @; = iteration count for loop16
|
| ++ beq inflate_fast_copy_gap1b_proc_16bytes_loop_done
|
| ++
|
| ++inflate_fast_copy_gap1b_proc_16bytes_loop:
|
| ++
|
| ++ vst1.8 {q0},[r3]! @; store 16 bytes in out and
|
| ++ @; increment out pointer
|
| ++ sub r0,r0,#16 @; subtract 16 from len
|
| ++ subs r4,r4,#1 @; decrement iteration count
|
| ++ bne inflate_fast_copy_gap1b_proc_16bytes_loop
|
| ++
|
| ++inflate_fast_copy_gap1b_proc_16bytes_loop_done:
|
| ++
|
| ++ cmp r0,#0
|
| ++ subeq r3,r3,#1 @; out--
|
| ++ streq r3,[r1,#0] @; r1 = updated pointer to pointer
|
| ++ @; to out
|
| ++ beq inflate_fast_copy_exit
|
| ++
|
| ++inflate_fast_copy_gap1b_proc_lastfewbytes_loop:
|
| ++
|
| ++ strb r12,[r3],#1 @; *out = r12, out++
|
| ++ subs r0,r0,#1 @; len--
|
| ++ bne inflate_fast_copy_gap1b_proc_lastfewbytes_loop
|
| ++
|
| ++ sub r3,r3,#1 @; out--
|
| ++ str r3,[r1,#0] @; r1 = updated pointer to pointer
|
| ++ @; to out
|
| ++ b inflate_fast_copy_exit
|
| ++
|
| ++ #;; ------------------------------------------------------------------
|
| ++ #;; vectorized copy routine when gap between ``from'' and ``out''
|
| ++ #;; buffers is 2 bytes
|
| ++ #;; INPUTS:
|
| ++ #;; r0 = len
|
| ++ #;; r2 = pointer to from
|
| ++ #;; r3 = pointer to out
|
| ++ #;; OUTPUTS:
|
| ++ #;; r1 = pointer to pointer to out
|
| ++ #;; ------------------------------------------------------------------
|
| ++inflate_fast_copy_gap2b_proc:
|
| ++
|
| ++ add r2,r2,#1 @; from++
|
| ++ add r3,r3,#1 @; out++
|
| ++ @
|
| ++ vld1.16 {d0[0]},[r2] @; load 2 bytes into d0[0]
|
| ++ vdup.16 q0,d0[0] @; duplicate those 2 bytes 8 times
|
| ++ @; to fill up q0
|
| ++ @
|
| ++ lsrs r4,r0,#4 @; r4 = floor(len/16)
|
| ++ @; = iteration count for loop16
|
| ++ beq inflate_fast_copy_gap2b_proc_16bytes_loop_done
|
| ++
|
| ++inflate_fast_copy_gap2b_proc_16bytes_loop:
|
| ++
|
| ++ vst1.8 {q0},[r3]! @; store 16 bytes in out and
|
| ++ @; increment out pointer
|
| ++ sub r0,r0,#16 @; subtract 16 from len
|
| ++ subs r4,r4,#1 @; decrement iteration count
|
| ++ bne inflate_fast_copy_gap2b_proc_16bytes_loop
|
| ++
|
| ++inflate_fast_copy_gap2b_proc_16bytes_loop_done:
|
| ++
|
| ++ cmp r0,#0
|
| ++ subeq r3,r3,#1 @; out--
|
| ++ streq r3,[r1,#0] @; r1 = updated pointer to pointer
|
| ++ @; to out
|
| ++ beq inflate_fast_copy_exit
|
| ++
|
| ++inflate_fast_copy_gap2b_proc_lastfewbytes_loop:
|
| ++
|
| ++ ldrb r12,[r2],#1 @; r12 = *from, from++
|
| ++ subs r0,r0,#1 @; len--
|
| ++ strb r12,[r3],#1 @; *out = r12, out++
|
| ++ @
|
| ++ bne inflate_fast_copy_gap2b_proc_lastfewbytes_loop
|
| ++
|
| ++ sub r3,r3,#1 @; out--
|
| ++ str r3,[r1,#0] @; r1 = updated pointer to pointer
|
| ++ @; to out
|
| ++ b inflate_fast_copy_exit
|
| ++
|
| ++ #;; ------------------------------------------------------------------
|
| ++ #;; vectorized copy routine when gap between ``from'' and ``out''
|
| ++ #;; buffers is 3 bytes
|
| ++ #;; INPUTS:
|
| ++ #;; r0 = len
|
| ++ #;; r2 = pointer to from
|
| ++ #;; r3 = pointer to out
|
| ++ #;; r4 = 3
|
| ++ #;; OUTPUTS:
|
| ++ #;; r1 = pointer to pointer to out
|
| ++ #;; ------------------------------------------------------------------
|
| ++inflate_fast_copy_gap3b_proc:
|
| ++
|
| ++ add r2,r2,#1 @; from++
|
| ++ add r3,r3,#1 @; out++
|
| ++ @
|
| ++ vld1.32 {d0[0]},[r2] @; load 4 bytes into d0[0]
|
| ++
|
| ++inflate_fast_copy_gap3b_proc_3bytes_loop:
|
| ++
|
| ++ cmp r0,#3 @; exit loop if len < 3
|
| ++ blt inflate_fast_copy_gap3b_proc_3bytes_loop_done
|
| ++
|
| ++ vst1.32 {d0[0]},[r3],r4 @; store 4 bytes in out
|
| ++ @; out+=3
|
| ++
|
| ++ sub r0,r0,#3 @; len-=3
|
| ++ b inflate_fast_copy_gap3b_proc_3bytes_loop
|
| ++
|
| ++inflate_fast_copy_gap3b_proc_3bytes_loop_done:
|
| ++
|
| ++ cmp r0,#0
|
| ++ subeq r3,r3,#1 @; out--
|
| ++ streq r3,[r1,#0] @; r1 = updated pointer to pointer
|
| ++ @; to out
|
| ++ beq inflate_fast_copy_exit
|
| ++
|
| ++inflate_fast_copy_gap3b_proc_lastfewbytes_loop:
|
| ++
|
| ++ ldrb r12,[r2],#1 @; r12 = *from, from++
|
| ++ subs r0,r0,#1 @; len--
|
| ++ strb r12,[r3],#1 @; *out = r12, out++
|
| ++
|
| ++ bne inflate_fast_copy_gap3b_proc_lastfewbytes_loop
|
| ++
|
| ++ sub r3,r3,#1 @; out--
|
| ++ str r3,[r1,#0] @; r1 = updated pointer to pointer
|
| ++ @; to out
|
| ++ b inflate_fast_copy_exit
|
| ++
|
| ++ #;; ------------------------------------------------------------------
|
| ++ #;; vectorized copy routine when gap between ``from'' and ``out''
|
| ++ #;; buffers is 4 bytes
|
| ++ #;; INPUTS:
|
| ++ #;; r0 = len
|
| ++ #;; r2 = pointer to from
|
| ++ #;; r3 = pointer to out
|
| ++ #;; OUTPUTS:
|
| ++ #;; r1 = pointer to pointer to out
|
| ++ #;; ------------------------------------------------------------------
|
| ++inflate_fast_copy_gap4b_proc:
|
| ++
|
| ++ add r2,r2,#1 @; from++
|
| ++ add r3,r3,#1 @; out++
|
| ++ @
|
| ++ vld1.32 {d0[0]},[r2] @; load 4 bytes into d0[0]
|
| ++ vdup.32 q0,d0[0] @; duplicate those 4 bytes 4 times
|
| ++ @; to fill up q0
|
| ++ @
|
| ++ lsrs r4,r0,#4 @; r4 = floor(len/16)
|
| ++ @; = iteration count for loop16
|
| ++ beq inflate_fast_copy_gap4b_proc_16bytes_loop_done
|
| ++
|
| ++inflate_fast_copy_gap4b_proc_16bytes_loop:
|
| ++
|
| ++ vst1.32 {q0},[r3]! @; store 16 bytes in out and
|
| ++ @; increment out pointer
|
| ++ sub r0,r0,#16 @; subtract 16 from len
|
| ++ subs r4,r4,#1 @; decrement iteration count
|
| ++ bne inflate_fast_copy_gap4b_proc_16bytes_loop
|
| ++
|
| ++inflate_fast_copy_gap4b_proc_16bytes_loop_done:
|
| ++
|
| ++ cmp r0,#0
|
| ++ subeq r3,r3,#1 @; out--
|
| ++ streq r3,[r1,#0] @; r1 = updated pointer to pointer
|
| ++ @; to out
|
| ++ beq inflate_fast_copy_exit
|
| ++
|
| ++inflate_fast_copy_gap4b_proc_lastfewbytes_loop:
|
| ++
|
| ++ ldrb r12,[r2],#1 @; r12 = *from, from++
|
| ++ subs r0,r0,#1 @; len--
|
| ++ strb r12,[r3],#1 @; *out = r12, out++
|
| ++
|
| ++ bne inflate_fast_copy_gap4b_proc_lastfewbytes_loop
|
| ++
|
| ++ sub r3,r3,#1 @; out--
|
| ++ str r3,[r1,#0] @; r1 = updated pointer to pointer
|
| ++ @; to out
|
| ++ b inflate_fast_copy_exit
|
| ++
|
| ++ #;; ------------------------------------------------------------------
|
| ++ #;; vectorized copy routine when gap between ``from'' and ``out''
|
| ++ #;; buffers is {5-7} bytes
|
| ++ #;; INPUTS:
|
| ++ #;; r0 = len
|
| ++ #;; r2 = pointer to from
|
| ++ #;; r3 = pointer to out
|
| ++ #;; r4 = {5-7}
|
| ++ #;; OUTPUTS:
|
| ++ #;; r1 = pointer to pointer to out
|
| ++ #;; ------------------------------------------------------------------
|
| ++inflate_fast_copy_gap5to7b_proc:
|
| ++
|
| ++ add r2,r2,#1 @; from++
|
| ++ add r3,r3,#1 @; out++
|
| ++ @
|
| ++ vld1.8 {d0},[r2] @; load 8 bytes into d0
|
| ++
|
| ++inflate_fast_copy_gap5to7b_proc_5to7bytes_loop:
|
| ++
|
| ++ cmp r0,r4 @; exit loop if len < {5-7}
|
| ++ blt inflate_fast_copy_gap5to7b_proc_5to7bytes_loop_done
|
| ++
|
| ++ vst1.8 {d0},[r3],r4 @; store 8 bytes in out
|
| ++ @; out+={5-7}
|
| ++
|
| ++ sub r0,r0,r4 @; len-={5-7}
|
| ++ b inflate_fast_copy_gap5to7b_proc_5to7bytes_loop
|
| ++
|
| ++inflate_fast_copy_gap5to7b_proc_5to7bytes_loop_done:
|
| ++
|
| ++ cmp r0,#0
|
| ++ subeq r3,r3,#1 @; out--
|
| ++ streq r3,[r1,#0] @; r1 = updated pointer to pointer
|
| ++ @; to out
|
| ++ beq inflate_fast_copy_exit
|
| ++
|
| ++inflate_fast_copy_gap5to7b_proc_lastfewbytes_loop:
|
| ++
|
| ++ ldrb r12,[r2],#1 @; r12 = *from, from++
|
| ++ subs r0,r0,#1 @; len--
|
| ++ strb r12,[r3],#1 @; *out = r12, out++
|
| ++
|
| ++ bne inflate_fast_copy_gap5to7b_proc_lastfewbytes_loop
|
| ++
|
| ++ sub r3,r3,#1 @; out--
|
| ++ str r3,[r1,#0] @; r1 = updated pointer to pointer
|
| ++ @; to out
|
| ++ b inflate_fast_copy_exit
|
| ++
|
| ++ #;; ------------------------------------------------------------------
|
| ++ #;; vectorized copy routine when gap between ``from'' and ``out''
|
| ++ #;; buffers is 8 bytes
|
| ++ #;; INPUTS:
|
| ++ #;; r0 = len
|
| ++ #;; r2 = pointer to from
|
| ++ #;; r3 = pointer to out
|
| ++ #;; OUTPUTS:
|
| ++ #;; r1 = pointer to pointer to out
|
| ++ #;; ------------------------------------------------------------------
|
| ++inflate_fast_copy_gap8b_proc:
|
| ++
|
| ++ add r2,r2,#1 @; from++
|
| ++ add r3,r3,#1 @; out++
|
| ++ @
|
| ++ vld1.8 {d0},[r2] @; load 8 bytes into d0
|
| ++ vmov d1,d0 @; duplicate the 8 bytes to fill up
|
| ++ @; q0
|
| ++ @
|
| ++ lsrs r4,r0,#4 @; r4 = floor(len/16)
|
| ++ @; = iteration count for loop16
|
| ++ beq inflate_fast_copy_gap8b_proc_16bytes_loop_done
|
| ++
|
| ++inflate_fast_copy_gap8b_proc_16bytes_loop:
|
| ++
|
| ++ vst1.8 {q0},[r3]! @; store 16 bytes in out and
|
| ++ @; increment out pointer
|
| ++ sub r0,r0,#16 @; subtract 16 from len
|
| ++ subs r4,r4,#1 @; decrement iteration count
|
| ++ bne inflate_fast_copy_gap8b_proc_16bytes_loop
|
| ++
|
| ++inflate_fast_copy_gap8b_proc_16bytes_loop_done:
|
| ++
|
| ++ cmp r0,#0
|
| ++ subeq r3,r3,#1 @; out--
|
| ++ streq r3,[r1,#0] @; r1 = updated pointer to pointer
|
| ++ @; to out
|
| ++ beq inflate_fast_copy_exit
|
| ++
|
| ++inflate_fast_copy_gap8b_proc_lastfewbytes_loop:
|
| ++
|
| ++ ldrb r12,[r2],#1 @; r12 = *from, from++
|
| ++ subs r0,r0,#1 @; len--
|
| ++ strb r12,[r3],#1 @; *out = r12, out++
|
| ++
|
| ++ bne inflate_fast_copy_gap8b_proc_lastfewbytes_loop
|
| ++
|
| ++ sub r3,r3,#1 @; out--
|
| ++ str r3,[r1,#0] @; r1 = updated pointer to pointer
|
| ++ @; to out
|
| ++ b inflate_fast_copy_exit
|
| ++
|
| ++ #;; ------------------------------------------------------------------
|
| ++ #;; vectorized copy routine when gap between ``from'' and ``out''
|
| ++ #;; buffers is {9-15} bytes
|
| ++ #;; INPUTS:
|
| ++ #;; r0 = len
|
| ++ #;; r2 = pointer to from
|
| ++ #;; r3 = pointer to out
|
| ++ #;; r4 = {9-15}
|
| ++ #;; OUTPUTS:
|
| ++ #;; r1 = pointer to pointer to out
|
| ++ #;; ------------------------------------------------------------------
|
| ++inflate_fast_copy_gap9to15b_proc:
|
| ++
|
| ++ add r2,r2,#1 @; from++
|
| ++ add r3,r3,#1 @; out++
|
| ++ @
|
| ++ vld1.8 {q0},[r2] @; load 16 bytes into q0
|
| ++
|
| ++inflate_fast_copy_gap9to15b_proc_9to15bytes_loop:
|
| ++
|
| ++ cmp r0, r4 @; exit loop if len < {9-15}
|
| ++ blt inflate_fast_copy_gap9to15b_proc_9to15bytes_loop_done
|
| ++
|
| ++ vst1.8 {q0},[r3],r4 @; store 16 bytes in out
|
| ++ @; out+={9-15}
|
| ++
|
| ++ sub r0,r0,r4 @; len-={9-15}
|
| ++ b inflate_fast_copy_gap9to15b_proc_9to15bytes_loop
|
| ++
|
| ++inflate_fast_copy_gap9to15b_proc_9to15bytes_loop_done:
|
| ++
|
| ++ cmp r0,#0
|
| ++ subeq r3,r3,#1 @; out--
|
| ++ streq r3,[r1,#0] @; r1 = updated pointer to pointer
|
| ++ @; to out
|
| ++ beq inflate_fast_copy_exit
|
| ++
|
| ++inflate_fast_copy_gap9to15b_proc_lastfewbytes_loop:
|
| ++
|
| ++ ldrb r12,[r2],#1 @; r12 = *from, from++
|
| ++ subs r0,r0,#1 @; len--
|
| ++ strb r12,[r3],#1 @; *out = r12, out++
|
| ++
|
| ++ bne inflate_fast_copy_gap9to15b_proc_lastfewbytes_loop
|
| ++
|
| ++ sub r3,r3,#1 @; out--
|
| ++ str r3,[r1,#0] @; r1 = updated pointer to pointer
|
| ++ @; to out
|
| ++ b inflate_fast_copy_exit
|
| ++
|
| ++ #;; ------------------------------------------------------------------
|
| ++ #;; vectorized copy routine when gap between ``from'' and ``out''
|
| ++ #;; buffers is 16 bytes or more
|
| ++ #;; INPUTS:
|
| ++ #;; r0 = len
|
| ++ #;; r2 = pointer to from
|
| ++ #;; r3 = pointer to out
|
| ++ #;; OUTPUTS:
|
| ++ #;; r1 = pointer to pointer to out
|
| ++ #;; ------------------------------------------------------------------
|
| ++inflate_fast_copy_gap16b_proc:
|
| ++
|
| ++ add r2,r2,#1 @; from++
|
| ++ add r3,r3,#1 @; out++
|
| ++ @
|
| ++ lsrs r4,r0,#4 @; r4 = floor(len/16)
|
| ++ @; = iteration count for loop16
|
| ++ beq inflate_fast_copy_gap16b_proc_16bytes_loop_done
|
| ++
|
| ++inflate_fast_copy_gap16b_proc_16bytes_loop:
|
| ++
|
| ++ vld1.8 {q0},[r2]! @; load 16 bytes into q0 and
|
| ++ @; increment from pointer
|
| ++ vst1.8 {q0},[r3]! @; store 16 bytes in out and
|
| ++ @; increment out pointer
|
| ++ sub r0,r0,#16 @; subtract 16 from len
|
| ++ subs r4,r4,#1 @; decrement iteration count
|
| ++ bne inflate_fast_copy_gap16b_proc_16bytes_loop
|
| ++
|
| ++inflate_fast_copy_gap16b_proc_16bytes_loop_done:
|
| ++
|
| ++ cmp r0,#0
|
| ++ subeq r3,r3,#1 @; out--
|
| ++ streq r3,[r1,#0] @; r1 = updated pointer to pointer
|
| ++ @; to out
|
| ++ beq inflate_fast_copy_exit
|
| ++
|
| ++inflate_fast_copy_gap16b_proc_lastfewbytes_loop:
|
| ++
|
| ++ ldrb r12,[r2],#1 @; r12 = *from, from++
|
| ++ subs r0,r0,#1 @; len--
|
| ++ strb r12,[r3],#1 @; *out = r12, out++
|
| ++
|
| ++ bne inflate_fast_copy_gap16b_proc_lastfewbytes_loop
|
| ++
|
| ++ sub r3,r3,#1 @; out--
|
| ++ str r3,[r1,#0] @; r1 = updated pointer to pointer
|
| ++ @; to out
|
| ++
|
| ++inflate_fast_copy_exit:
|
| ++
|
| ++ pop {r4-r11} @; pop r4-r11 from stack
|
| ++ bx lr @; return
|
| ++
|
| ++.size inflate_fast_copy_neon, .-inflate_fast_copy_neon
|
| ++
|
| ++#endif
|
| ++ .END
|
| ++
|
| ++
|
|
|
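|
| Why the gap dispatch above is safe: when the match distance is smaller than
| len, an ordered copy turns the gap-sized prefix into a repeating pattern,
| which is exactly what the vdup/strided-store paths reproduce 16 bytes at a
| time. A small stand-alone C demo (illustration only, not patch code):
|
|     #include <stdio.h>
|     #include <string.h>
|
|     int main(void)
|     {
|         unsigned char buf[20];
|         memset(buf, 0, sizeof buf);
|         buf[0] = 'A';
|
|         unsigned char *from = buf;       /* gap = out - from = 1 */
|         unsigned char *out  = buf + 1;
|         for (int len = 16; len-- > 0; )
|             *out++ = *from++;            /* byte-ordered copy */
|
|         printf("%.17s\n", buf);          /* prints 17 'A's: the single
|                                             gap byte was replicated, just
|                                             as the gap==1 vdup path does */
|         return 0;
|     }
|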