Index: src/opts/memset16_neon.S |
diff --git a/src/opts/memset16_neon.S b/src/opts/memset16_neon.S |
deleted file mode 100644 |
index b39832fff163627a532ef34209fb9309717c2e06..0000000000000000000000000000000000000000 |
--- a/src/opts/memset16_neon.S |
+++ /dev/null |
@@ -1,143 +0,0 @@ |
-/*************************************************************************** |
- * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved. |
- * |
- * Use of this source code is governed by a BSD-style license that can be |
- * found in the LICENSE file. |
- ***************************************************************************/ |
- |
-/*************************************************************************** |
- Neon memset: Attempts to do a memset with Neon registers if possible, |
- Inputs: |
- s: The buffer to write to |
- c: The integer data to write to the buffer |
- n: The size_t count. |
- Outputs: |
- |
-***************************************************************************/ |
- |
- .syntax unified |
- .code 32 |
- .fpu neon |
- .align 4 |
- .globl memset16_neon |
- .hidden memset16_neon |
- |
-memset16_neon: |
- cmp r2, #0 |
- bxeq lr |
- |
- /* Keep in mind that r2 -- the count argument -- is for the |
- * number of 16-bit items to copy. |
- */ |
- lsl r2, r2, #1 |
- |
- push {r0} |
- |
- /* If we have < 8 bytes, just do a quick loop to handle that */ |
- cmp r2, #8 |
- bgt memset_gt4 |
-memset_smallcopy_loop: |
- strh r1, [r0], #2 |
- subs r2, r2, #2 |
- bne memset_smallcopy_loop |
-memset_smallcopy_done: |
- pop {r0} |
- bx lr |
- |
-memset_gt4: |
- /* |
- * Duplicate the r1 lowest 16-bits across r1. The idea is to have |
- * a register with two 16-bit-values we can copy. We do this by |
- * duplicating lowest 16-bits of r1 to upper 16-bits. |
- */ |
- orr r1, r1, r1, lsl #16 |
- /* |
- * If we're copying > 64 bytes, then we may want to get |
- * onto a 16-byte boundary to improve speed even more. |
- */ |
- cmp r2, #64 |
- blt memset_route |
- ands r12, r0, #0xf |
- beq memset_route |
- /* |
- * Determine the number of bytes to move forward to get to the 16-byte |
- * boundary. Note that this will be a multiple of 4, since we |
- * already are word-aligned. |
- */ |
- rsb r12, r12, #16 |
- sub r2, r2, r12 |
- lsls r12, r12, #29 |
- strmi r1, [r0], #4 |
- strcs r1, [r0], #4 |
- strcs r1, [r0], #4 |
- lsls r12, r12, #2 |
- strhcs r1, [r0], #2 |
-memset_route: |
- /* |
- * Decide where to route for the maximum copy sizes. Note that we |
- * build q0 and q1 depending on if we'll need it, so that's |
- * interwoven here as well. |
- */ |
- vdup.u32 d0, r1 |
- cmp r2, #16 |
- blt memset_8 |
- vmov d1, d0 |
- cmp r2, #64 |
- blt memset_16 |
- vmov q1, q0 |
- cmp r2, #128 |
- blt memset_32 |
-memset_128: |
- mov r12, r2, lsr #7 |
-memset_128_loop: |
- vst1.64 {q0, q1}, [r0]! |
- vst1.64 {q0, q1}, [r0]! |
- vst1.64 {q0, q1}, [r0]! |
- vst1.64 {q0, q1}, [r0]! |
- subs r12, r12, #1 |
- bne memset_128_loop |
- ands r2, r2, #0x7f |
- beq memset_end |
-memset_32: |
- movs r12, r2, lsr #5 |
- beq memset_16 |
-memset_32_loop: |
- subs r12, r12, #1 |
- vst1.64 {q0, q1}, [r0]! |
- bne memset_32_loop |
- ands r2, r2, #0x1f |
- beq memset_end |
-memset_16: |
- movs r12, r2, lsr #4 |
- beq memset_8 |
-memset_16_loop: |
- subs r12, r12, #1 |
- vst1.32 {q0}, [r0]! |
- bne memset_16_loop |
- ands r2, r2, #0xf |
- beq memset_end |
- /* |
- * memset_8 isn't a loop, since we try to do our loops at 16 |
- * bytes and above. We should loop there, then drop down here |
- * to finish the <16-byte versions. Same for memset_4 and |
- * memset_1. |
- */ |
-memset_8: |
- cmp r2, #8 |
- blt memset_4 |
- subs r2, r2, #8 |
- vst1.32 {d0}, [r0]! |
-memset_4: |
- cmp r2, #4 |
- blt memset_2 |
- subs r2, r2, #4 |
- str r1, [r0], #4 |
-memset_2: |
- cmp r2, #0 |
- ble memset_end |
- strh r1, [r0], #2 |
-memset_end: |
- pop {r0} |
- bx lr |
- |
- .end |