| Index: src/opts/memset16_neon.S
|
| diff --git a/src/opts/memset16_neon.S b/src/opts/memset16_neon.S
|
| deleted file mode 100644
|
| index b39832fff163627a532ef34209fb9309717c2e06..0000000000000000000000000000000000000000
|
| --- a/src/opts/memset16_neon.S
|
| +++ /dev/null
|
| @@ -1,143 +0,0 @@
|
| -/***************************************************************************
|
| - * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
|
| - *
|
| - * Use of this source code is governed by a BSD-style license that can be
|
| - * found in the LICENSE file.
|
| - ***************************************************************************/
|
| -
|
| -/***************************************************************************
|
| - Neon memset: Attempts to do a memset with Neon registers if possible,
|
| - Inputs:
|
| - s: The buffer to write to
|
| - c: The integer data to write to the buffer
|
| - n: The size_t count.
|
| - Outputs:
|
| -
|
| -***************************************************************************/
|
| -
|
| - .syntax unified
|
| - .code 32
|
| - .fpu neon
|
| - .align 4
|
| - .globl memset16_neon
|
| - .hidden memset16_neon
|
| -
|
| -memset16_neon:
|
| - cmp r2, #0
|
| - bxeq lr
|
| -
|
| - /* Keep in mind that r2 -- the count argument -- is for the
|
| - * number of 16-bit items to copy.
|
| - */
|
| - lsl r2, r2, #1
|
| -
|
| - push {r0}
|
| -
|
| - /* If we have < 8 bytes, just do a quick loop to handle that */
|
| - cmp r2, #8
|
| - bgt memset_gt4
|
| -memset_smallcopy_loop:
|
| - strh r1, [r0], #2
|
| - subs r2, r2, #2
|
| - bne memset_smallcopy_loop
|
| -memset_smallcopy_done:
|
| - pop {r0}
|
| - bx lr
|
| -
|
| -memset_gt4:
|
| - /*
|
| - * Duplicate the r1 lowest 16-bits across r1. The idea is to have
|
| - * a register with two 16-bit-values we can copy. We do this by
|
| - * duplicating lowest 16-bits of r1 to upper 16-bits.
|
| - */
|
| - orr r1, r1, r1, lsl #16
|
| - /*
|
| - * If we're copying > 64 bytes, then we may want to get
|
| - * onto a 16-byte boundary to improve speed even more.
|
| - */
|
| - cmp r2, #64
|
| - blt memset_route
|
| - ands r12, r0, #0xf
|
| - beq memset_route
|
| - /*
|
| - * Determine the number of bytes to move forward to get to the 16-byte
|
| - * boundary. Note that this will be a multiple of 4, since we
|
| - * already are word-aligned.
|
| - */
|
| - rsb r12, r12, #16
|
| - sub r2, r2, r12
|
| - lsls r12, r12, #29
|
| - strmi r1, [r0], #4
|
| - strcs r1, [r0], #4
|
| - strcs r1, [r0], #4
|
| - lsls r12, r12, #2
|
| - strhcs r1, [r0], #2
|
| -memset_route:
|
| - /*
|
| - * Decide where to route for the maximum copy sizes. Note that we
|
| - * build q0 and q1 depending on if we'll need it, so that's
|
| - * interwoven here as well.
|
| - */
|
| - vdup.u32 d0, r1
|
| - cmp r2, #16
|
| - blt memset_8
|
| - vmov d1, d0
|
| - cmp r2, #64
|
| - blt memset_16
|
| - vmov q1, q0
|
| - cmp r2, #128
|
| - blt memset_32
|
| -memset_128:
|
| - mov r12, r2, lsr #7
|
| -memset_128_loop:
|
| - vst1.64 {q0, q1}, [r0]!
|
| - vst1.64 {q0, q1}, [r0]!
|
| - vst1.64 {q0, q1}, [r0]!
|
| - vst1.64 {q0, q1}, [r0]!
|
| - subs r12, r12, #1
|
| - bne memset_128_loop
|
| - ands r2, r2, #0x7f
|
| - beq memset_end
|
| -memset_32:
|
| - movs r12, r2, lsr #5
|
| - beq memset_16
|
| -memset_32_loop:
|
| - subs r12, r12, #1
|
| - vst1.64 {q0, q1}, [r0]!
|
| - bne memset_32_loop
|
| - ands r2, r2, #0x1f
|
| - beq memset_end
|
| -memset_16:
|
| - movs r12, r2, lsr #4
|
| - beq memset_8
|
| -memset_16_loop:
|
| - subs r12, r12, #1
|
| - vst1.32 {q0}, [r0]!
|
| - bne memset_16_loop
|
| - ands r2, r2, #0xf
|
| - beq memset_end
|
| - /*
|
| - * memset_8 isn't a loop, since we try to do our loops at 16
|
| - * bytes and above. We should loop there, then drop down here
|
| - * to finish the <16-byte versions. Same for memset_4 and
|
| - * memset_1.
|
| - */
|
| -memset_8:
|
| - cmp r2, #8
|
| - blt memset_4
|
| - subs r2, r2, #8
|
| - vst1.32 {d0}, [r0]!
|
| -memset_4:
|
| - cmp r2, #4
|
| - blt memset_2
|
| - subs r2, r2, #4
|
| - str r1, [r0], #4
|
| -memset_2:
|
| - cmp r2, #0
|
| - ble memset_end
|
| - strh r1, [r0], #2
|
| -memset_end:
|
| - pop {r0}
|
| - bx lr
|
| -
|
| - .end
|
|
|