/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * Optimized memcpy() for ARM.
 *
 * note that memcpy() always returns the destination pointer,
 * so we have to preserve R0.
 */

/*
 * This file has been modified from the original for use in musl libc.
 * The main changes are: addition of .type memcpy,%function to make the
 * code safely callable from thumb mode, adjusting the return
 * instructions to be compatible with pre-thumb ARM cpus, and removal
 * of prefetch code that is not compatible with older cpus.
 */

.syntax unified

.global memcpy
.type memcpy,%function
memcpy:
        /* The stack must always be 64-bit aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4,
         * which we can use for better pipelining of the reads below.
         */
        .fnstart
        .save       {r0, r4, lr}
        stmfd       sp!, {r0, r4, lr}
        /* Make room for r5-r11, which will be spilled later. */
        .pad        #28
        sub         sp, sp, #28

        /* it simplifies things to take care of len<4 early */
        cmp     r2, #4
        blo     copy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
        rsb     r3, r1, #0
        ands    r3, r3, #3
        beq     src_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
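        /* lsl #31 puts bit 0 of r3 into N and bit 1 into the carry flag, so
         * the mi/cs conditional byte copies below move exactly 1, 2 or 3
         * bytes depending on the alignment offset.
         */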
        movs    r12, r3, lsl #31
        sub     r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
        ldrbmi r3, [r1], #1
        ldrbcs r4, [r1], #1
        ldrbcs r12,[r1], #1
        strbmi r3, [r0], #1
        strbcs r4, [r0], #1
        strbcs r12,[r0], #1

src_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor     r12, r0, r1
        tst     r12, #3
        bne     non_congruent

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea   sp, {r5-r11}

        /* align the destination to a cache-line */
        rsb     r3, r0, #0
        ands    r3, r3, #0x1C
        beq     congruent_aligned32
        cmp     r3, r2
        andhi   r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3) */
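        /* lsl #28 puts bit 4 of r3 into the carry flag and bit 3 into N, so
         * cs selects a 16-byte block, mi an 8-byte block, and the tst #0x4
         * below picks up the remaining word.
         */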
        movs    r12, r3, lsl #28
        ldmcs   r1!, {r4, r5, r6, r7}           /* 16 bytes */
        ldmmi   r1!, {r8, r9}                   /*  8 bytes */
        stmcs   r0!, {r4, r5, r6, r7}
        stmmi   r0!, {r8, r9}
        tst     r3, #0x4
        ldrne   r10,[r1], #4                    /*  4 bytes */
        strne   r10,[r0], #4
        sub     r2, r2, r3

congruent_aligned32:
        /*
         * here the destination is aligned to a 32-byte cache line
         * (or fewer than 32 bytes remain).
         */

cached_aligned32:
        subs    r2, r2, #32
        blo     less_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the write-buffer will
         * start dumping its content into memory.
         *
         * While all this is going on, we then load a full cache line into
         * 8 registers; this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         */

        /* Align the preload register to a cache-line because the cpu does
         * "critical word first" (the first word requested is loaded first).
         */
        @ bic           r12, r1, #0x1F
        @ add           r12, r12, #64

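        /* Main copy loop: 32 bytes (r4-r11) per iteration. r2 was biased by
         * -32 above, so once it goes negative the loop exits and the add #32
         * below restores the true remaining count.
         */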
1:      ldmia   r1!, { r4-r11 }
        subs    r2, r2, #32

        /*
         * NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
         * for ARM9 preload will not be safely guarded by the preceding subs.
         * When it is safely guarded, the only way to get a SIGSEGV here
         * is for the caller to overstate the length.
         */
        @ ldrhi         r3, [r12], #32      /* cheap ARM9 preload */
        stmia   r0!, { r4-r11 }
        bhs     1b

        add     r2, r2, #32

less_than_32_left:
        /*
         * less than 32 bytes left at this point (length in r2)
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (if not executed the code below takes
         * about 16 cycles)
         */
        tst     r2, #0x1F
        beq     1f

        /* conditionally copies 0 to 31 bytes */
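        /* Same flag trick as above: lsl #28 tests bits 4 and 3 of the count
         * (16- and 8-byte blocks), lsl #30 tests bits 2 and 1 (4- and 2-byte
         * pieces), and tst #0x1 handles the final byte.
         */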
        movs    r12, r2, lsl #28
        ldmcs   r1!, {r4, r5, r6, r7}           /* 16 bytes */
        ldmmi   r1!, {r8, r9}                   /*  8 bytes */
        stmcs   r0!, {r4, r5, r6, r7}
        stmmi   r0!, {r8, r9}
        movs    r12, r2, lsl #30
        ldrcs   r3, [r1], #4                    /*  4 bytes */
        ldrhmi r4, [r1], #2                     /*  2 bytes */
        strcs   r3, [r0], #4
        strhmi r4, [r0], #2
        tst     r2, #0x1
        ldrbne r3, [r1]                         /*  last byte  */
        strbne r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd   sp!, {r5-r11}
        ldmfd   sp!, {r0, r4, lr}
        bx      lr

        /********************************************************************/

non_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
        cmp     r2, #4
        blo     copy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea   sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb     r5, r0, #0
        and     r5, r5, #3                      /* r5 = # bytes in partial words */
        mov     r12, r5, lsl #3                 /* r12 = right shift amount */
        rsb     lr, r12, #32                    /* lr = left shift amount */
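        /* r12/lr are used below to merge two consecutive source words into
         * one aligned destination word (a software funnel shift). */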

        /* read the first word */
        ldr     r3, [r1], #4
        sub     r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that the destination
         * becomes aligned to 32 bits (r5 = number of bytes to copy for
         * alignment)
         */
        movs    r5, r5, lsl #31
        strbmi r3, [r0], #1
        movmi   r3, r3, lsr #8
        strbcs r3, [r0], #1
        movcs   r3, r3, lsr #8
        strbcs r3, [r0], #1
        movcs   r3, r3, lsr #8

        cmp     r2, #4
        blo     partial_word_tail

        /* Align destination to 32 bytes (cache line boundary) */
1:      tst     r0, #0x1c
        beq     2f
        ldr     r5, [r1], #4
        sub     r2, r2, #4
        orr     r4, r3, r5, lsl lr
        mov     r3, r5, lsr r12
        str     r4, [r0], #4
        cmp     r2, #4
        bhs     1b
        blo     partial_word_tail

        /* copy 32 bytes at a time */
2:      subs    r2, r2, #32
        blo     less_than_thirtytwo

        /* Use immediate mode for the shifts, because register shifts take an
         * extra cycle, which could account for a performance hit of up to 50%.
         */

        cmp     r12, #24
        beq     loop24
        cmp     r12, #8
        beq     loop8

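        /* loop16, loop8 and loop24 each handle one fixed value of the right
         * shift amount r12 (16, 8 and 24 bits respectively), so the merging
         * shifts below can use immediate operands.
         */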
loop16:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, {r5,r6,r7, r8,r9,r10,r11}
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #16
        mov     r4, r4, lsr #16
        orr     r4, r4, r5, lsl #16
        mov     r5, r5, lsr #16
        orr     r5, r5, r6, lsl #16
        mov     r6, r6, lsr #16
        orr     r6, r6, r7, lsl #16
        mov     r7, r7, lsr #16
        orr     r7, r7, r8, lsl #16
        mov     r8, r8, lsr #16
        orr     r8, r8, r9, lsl #16
        mov     r9, r9, lsr #16
        orr     r9, r9, r10, lsl #16
        mov     r10, r10, lsr #16
        orr     r10, r10, r11, lsl #16
        stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov     r3, r11, lsr #16
        bhs     1b
        b       less_than_thirtytwo

loop8:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, {r5,r6,r7, r8,r9,r10,r11}
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #24
        mov     r4, r4, lsr #8
        orr     r4, r4, r5, lsl #24
        mov     r5, r5, lsr #8
        orr     r5, r5, r6, lsl #24
        mov     r6, r6, lsr #8
        orr     r6, r6, r7, lsl #24
        mov     r7, r7, lsr #8
        orr     r7, r7, r8, lsl #24
        mov     r8, r8, lsr #8
        orr     r8, r8, r9, lsl #24
        mov     r9, r9, lsr #8
        orr     r9, r9, r10, lsl #24
        mov     r10, r10, lsr #8
        orr     r10, r10, r11, lsl #24
        stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov     r3, r11, lsr #8
        bhs     1b
        b       less_than_thirtytwo

loop24:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, {r5,r6,r7, r8,r9,r10,r11}
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #8
        mov     r4, r4, lsr #24
        orr     r4, r4, r5, lsl #8
        mov     r5, r5, lsr #24
        orr     r5, r5, r6, lsl #8
        mov     r6, r6, lsr #24
        orr     r6, r6, r7, lsl #8
        mov     r7, r7, lsr #24
        orr     r7, r7, r8, lsl #8
        mov     r8, r8, lsr #24
        orr     r8, r8, r9, lsl #8
        mov     r9, r9, lsr #24
        orr     r9, r9, r10, lsl #8
        mov     r10, r10, lsr #24
        orr     r10, r10, r11, lsl #8
        stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov     r3, r11, lsr #24
        bhs     1b

less_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb     r12, lr, #32            /* we corrupted r12, recompute it  */
        add     r2, r2, #32
        cmp     r2, #4
        blo     partial_word_tail

1:      ldr     r5, [r1], #4
        sub     r2, r2, #4
        orr     r4, r3, r5, lsl lr
        mov     r3, r5, lsr r12
        str     r4, [r0], #4
        cmp     r2, #4
        bhs     1b

partial_word_tail:
        /* we have a partial word in the input buffer */
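        /* The shift queue (r3) still holds lr/8 bytes. lsl #(31-3) moves
         * bit 3 of lr into N and bit 4 into the carry flag, so the
         * conditional byte stores below flush 1, 2 or 3 trailing bytes. */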
        movs    r5, lr, lsl #(31-3)
        strbmi r3, [r0], #1
        movmi   r3, r3, lsr #8
        strbcs r3, [r0], #1
        movcs   r3, r3, lsr #8
        strbcs r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd   sp, {r5-r11}

copy_last_3_and_return:
        movs    r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */
        ldrbmi r2, [r1], #1
        ldrbcs r3, [r1], #1
        ldrbcs r12,[r1]
        strbmi r2, [r0], #1
        strbcs r3, [r0], #1
        strbcs r12,[r0]

        /* we're done! restore sp and spilled registers and return */
        add     sp, sp, #28
        ldmfd   sp!, {r0, r4, lr}
        bx      lr