| OLD | NEW | 
|---|
|  | (Empty) | 
| 1 ; |  | 
| 2 ;  Copyright (c) 2013 The WebM project authors. All Rights Reserved. |  | 
| 3 ; |  | 
| 4 ;  Use of this source code is governed by a BSD-style license |  | 
| 5 ;  that can be found in the LICENSE file in the root of the source |  | 
| 6 ;  tree. An additional intellectual property rights grant can be found |  | 
| 7 ;  in the file PATENTS.  All contributing project authors may |  | 
| 8 ;  be found in the AUTHORS file in the root of the source tree. |  | 
| 9 ; |  | 
| 10 |  | 
| 11 |  | 
| 12     ; These functions are only valid when: |  | 
| 13     ; x_step_q4 == 16 (horiz) or y_step_q4 == 16 (vert); any other step is handed to the matching _c function |  | 
| 14     ; w%4 == 0 |  | 
| 15     ; h%4 == 0 |  | 
| 16     ; taps == 8 |  | 
| 17     ; VP9_FILTER_WEIGHT == 128 |  | 
| 18     ; VP9_FILTER_SHIFT == 7 |  | 
| 19 |  | 
| 20     EXPORT  |vp9_convolve8_avg_horiz_neon| |  | 
| 21     EXPORT  |vp9_convolve8_avg_vert_neon| |  | 
| 22     IMPORT  |vp9_convolve8_avg_horiz_c| |  | 
| 23     IMPORT  |vp9_convolve8_avg_vert_c| |  | 
| 24     ARM |  | 
| 25     REQUIRE8 |  | 
| 26     PRESERVE8 |  | 
| 27 |  | 
| 28     AREA ||.text||, CODE, READONLY, ALIGN=2 |  | 
| 29 |  | 
| 30     ; 8-tap multiply-accumulate: $dst = $src0*d0[0] + $src1*d0[1] + ... + $src7*d1[3]. The taps are the eight s16 lanes of q0 (d0:d1); vmull.s16/vmlal.s16 widen every s16 product to s32, so $dst is a Q register of four s32 sums |  | 
| 31     MACRO |  | 
| 32     MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 |  | 
| 33     vmull.s16 $dst, $src0, d0[0] |  | 
| 34     vmlal.s16 $dst, $src1, d0[1] |  | 
| 35     vmlal.s16 $dst, $src2, d0[2] |  | 
| 36     vmlal.s16 $dst, $src3, d0[3] |  | 
| 37     vmlal.s16 $dst, $src4, d1[0] |  | 
| 38     vmlal.s16 $dst, $src5, d1[1] |  | 
| 39     vmlal.s16 $dst, $src6, d1[2] |  | 
| 40     vmlal.s16 $dst, $src7, d1[3] |  | 
| 41     MEND |  | 
| 42 |  | 
| 43 ; vp9_convolve8_avg_horiz_neon: 8-tap horizontal filter of src (taps at filter_x), rounding-averaged into the pixels already in dst |  | 
| 44 ; r0 = const uint8_t *src, r1 = int src_stride |  | 
| 45 ; r2 = uint8_t *dst, r3 = int dst_stride |  | 
| 46 ; stack arguments, as byte offsets from sp on entry: |  | 
| 47 ; [sp, #0]  const int16_t *filter_x |  | 
| 48 ; [sp, #4]  int x_step_q4           ; anything other than 16 branches to vp9_convolve8_avg_horiz_c |  | 
| 49 ; [sp, #8]  const int16_t *filter_y ; unused |  | 
| 50 ; [sp, #12] int y_step_q4           ; unused |  | 
| 51 ; [sp, #16] int w |  | 
| 52 ; [sp, #20] int h |  | 
| 53 |  | 
| 54 |vp9_convolve8_avg_horiz_neon| PROC |  | 
| 55     ldr             r12, [sp, #4]           ; x_step_q4 (read before anything is pushed, so the entry offset applies) |  | 
| 56     cmp             r12, #16 |  | 
| 57     bne             vp9_convolve8_avg_horiz_c |  | 
| 58 |  | 
| 59     push            {r4-r10, lr} |  | 
| 60 |  | 
| 61     sub             r0, r0, #3              ; adjust for taps: back src up 3 bytes so every row load starts at column x - 3 |  | 
| 62 |  | 
| 63     ldr             r5, [sp, #32]           ; filter_x (entry offset 0 plus 32 bytes for the 8 pushed registers) |  | 
| 64     ldr             r6, [sp, #48]           ; w, used as the column counter of the inner loop |  | 
| 65     ldr             r7, [sp, #52]           ; h, used as the row counter of the outer loop |  | 
| 66 |  | 
| 67     vld1.s16        {q0}, [r5]              ; q0 = the 8 s16 taps of filter_x, read by MULTIPLY_BY_Q0 |  | 
| 68 |  | 
| 69     sub             r8, r1, r1, lsl #2      ; -src_stride * 3 |  | 
| 70     add             r8, r8, #4              ; -src_stride * 3 + 4: post-increment for the 4th row load, back to the first row and 4 bytes to the right |  | 
| 71 |  | 
| 72     sub             r4, r3, r3, lsl #2      ; -dst_stride * 3 |  | 
| 73     add             r4, r4, #4              ; -dst_stride * 3 + 4: post-increment for the 4th row store, back to the first row and 4 bytes to the right |  | 
| 74 |  | 
| 75     rsb             r9, r6, r1, lsl #2      ; reset src for outer loop: r9 = src_stride * 4 - w; the next line takes off 7 more to undo the +4 and +3 applied before the inner loop |  | 
| 76     sub             r9, r9, #7 |  | 
| 77     rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop: r12 = dst_stride * 4 - w |  | 
| 78 |  | 
| 79     mov             r10, r6                 ; keep w in r10 so the column counter r6 can be restored for every band of 4 rows |  | 
| 80 |  | 
| 81 vp9_convolve8_avg_loop_horiz_v |  | 
| 82     vld1.8          {d24}, [r0], r1 |  | 
| 83     vld1.8          {d25}, [r0], r1 |  | 
| 84     vld1.8          {d26}, [r0], r1 |  | 
| 85     vld1.8          {d27}, [r0], r8 |  | 
| 86 |  | 
| 87     vtrn.16         q12, q13 |  | 
| 88     vtrn.8          d24, d25 |  | 
| 89     vtrn.8          d26, d27 |  | 
| 90 |  | 
| 91     pld             [r0, r1, lsl #2] |  | 
| 92 |  | 
| 93     vmovl.u8        q8, d24 |  | 
| 94     vmovl.u8        q9, d25 |  | 
| 95     vmovl.u8        q10, d26 |  | 
| 96     vmovl.u8        q11, d27 |  | 
| 97 |  | 
| 98     ; save a few instructions in the inner loop: after these two moves columns 0..6 of the band sit in d16, d17, d20, d22, d18, d19, d23, the order of the first MULTIPLY_BY_Q0 operand list |  | 
| 99     vswp            d17, d18 |  | 
| 100     vmov            d23, d21 |  | 
| 101 |  | 
| 102     add             r0, r0, #3 |  | 
| 103 |  | 
| 104 vp9_convolve8_avg_loop_horiz |  | 
| 105     add             r5, r0, #64 |  | 
| 106 |  | 
| 107     vld1.32         {d28[]}, [r0], r1 |  | 
| 108     vld1.32         {d29[]}, [r0], r1 |  | 
| 109     vld1.32         {d31[]}, [r0], r1 |  | 
| 110     vld1.32         {d30[]}, [r0], r8 |  | 
| 111 |  | 
| 112     pld             [r5] |  | 
| 113 |  | 
| 114     vtrn.16         d28, d31 |  | 
| 115     vtrn.16         d29, d30 |  | 
| 116     vtrn.8          d28, d29 |  | 
| 117     vtrn.8          d31, d30 |  | 
| 118 |  | 
| 119     pld             [r5, r1] |  | 
| 120 |  | 
| 121     ; extract to s16: zero-extended from u8, the 4 new columns end up, left to right, in d24, d26, d27, d25 |  | 
| 122     vtrn.32         q14, q15 |  | 
| 123     vmovl.u8        q12, d28 |  | 
| 124     vmovl.u8        q13, d29 |  | 
| 125 |  | 
| 126     pld             [r5, r1, lsl #1] |  | 
| 127 |  | 
| 128     ; slightly out of order load to match the existing data: dst rows 0..3 go to d6[0], d7[0], d6[1], d7[1], the same lanes of q3 that the filtered rows occupy in q1 before the average |  | 
| 129     vld1.u32        {d6[0]}, [r2], r3 |  | 
| 130     vld1.u32        {d7[0]}, [r2], r3 |  | 
| 131     vld1.u32        {d6[1]}, [r2], r3 |  | 
| 132     vld1.u32        {d7[1]}, [r2], r3 |  | 
| 133 |  | 
| 134     sub             r2, r2, r3, lsl #2      ; reset for store: the 4 loads above advanced dst by 4 rows |  | 
| 135 |  | 
| 136     ; src[] * filter_x: one macro call per output column, each operand list shifted one column to the right |  | 
| 137     MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24 |  | 
| 138     MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26 |  | 
| 139     MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27 |  | 
| 140     MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25 |  | 
| 141 |  | 
| 142     pld             [r5, -r8] |  | 
| 143 |  | 
| 144     ; += 64 >> 7: vqrshrun.s32 rounds, shifts right by 7 and narrows s32 to u16 with unsigned saturation |  | 
| 145     vqrshrun.s32    d2, q1, #7 |  | 
| 146     vqrshrun.s32    d3, q2, #7 |  | 
| 147     vqrshrun.s32    d4, q14, #7 |  | 
| 148     vqrshrun.s32    d5, q15, #7 |  | 
| 149 |  | 
| 150     ; saturate: narrow u16 to u8, leaving output columns 0 and 1 in d2 and columns 2 and 3 in d3 |  | 
| 151     vqmovn.u16      d2, q1 |  | 
| 152     vqmovn.u16      d3, q2 |  | 
| 153 |  | 
| 154     ; transpose: turn the column-major 4x4 block in d2:d3 into rows, giving row 0 in d2[0], row 1 in d3[0], row 2 in d2[1], row 3 in d3[1] |  | 
| 155     vtrn.16         d2, d3 |  | 
| 156     vtrn.32         d2, d3 |  | 
| 157     vtrn.8          d2, d3 |  | 
| 158 |  | 
| 159     ; average the new value and the dst value: vrhadd.u8 computes (a + b + 1) >> 1 per byte |  | 
| 160     vrhadd.u8       q1, q1, q3 |  | 
| 161 |  | 
| 162     vst1.u32        {d2[0]}, [r2@32], r3 |  | 
| 163     vst1.u32        {d3[0]}, [r2@32], r3 |  | 
| 164     vst1.u32        {d2[1]}, [r2@32], r3 |  | 
| 165     vst1.u32        {d3[1]}, [r2@32], r4 |  | 
| 166 |  | 
| 167     vmov            q8,  q9 |  | 
| 168     vmov            d20, d23 |  | 
| 169     vmov            q11, q12 |  | 
| 170     vmov            q9,  q13 |  | 
| 171 |  | 
| 172     subs            r6, r6, #4              ; w -= 4 (the four vmov above slid the 7-column history 4 columns to the right for the next iteration) |  | 
| 173     bgt             vp9_convolve8_avg_loop_horiz |  | 
| 174 |  | 
| 175     ; outer loop: advance to the next band of 4 rows |  | 
| 176     mov             r6, r10                 ; restore w counter |  | 
| 177     add             r0, r0, r9              ; src += src_stride * 4 - w - 7, i.e. back to column -3, 4 rows further down |  | 
| 178     add             r2, r2, r12             ; dst += dst_stride * 4 - w, i.e. back to column 0, 4 rows further down |  | 
| 179     subs            r7, r7, #4              ; h -= 4 |  | 
| 180     bgt vp9_convolve8_avg_loop_horiz_v |  | 
| 181 |  | 
| 182     pop             {r4-r10, pc} |  | 
| 183 |  | 
| 184     ENDP |  | 
| 185 |  | 
| 186 |vp9_convolve8_avg_vert_neon| PROC |  | 
| 187     ldr             r12, [sp, #12] |  | 
| 188     cmp             r12, #16 |  | 
| 189     bne             vp9_convolve8_avg_vert_c |  | 
| 190 |  | 
| 191     push            {r4-r8, lr} |  | 
| 192 |  | 
| 193     ; vertical 8-tap pass, one 4-pixel-wide strip at a time; adjust for taps: src -= src_stride * 3 (the two subtractions below) so the first row loaded is the one multiplied by tap 0 |  | 
| 194     sub             r0, r0, r1 |  | 
| 195     sub             r0, r0, r1, lsl #1 |  | 
| 196 |  | 
| 197     ldr             r4, [sp, #32]           ; filter_y: entry offset 8 plus 24 bytes for the 6 pushed registers (the entry check above read [sp, #12], presumably y_step_q4 in the same argument layout as the horiz function) |  | 
| 198     ldr             r6, [sp, #40]           ; w, counted down by 4 per strip in the outer loop |  | 
| 199     ldr             lr, [sp, #44]           ; h, kept in lr and copied to r12 for each strip |  | 
| 200 |  | 
| 201     vld1.s16        {q0}, [r4]              ; q0 = the 8 s16 taps of filter_y, read by MULTIPLY_BY_Q0 |  | 
| 202 |  | 
| 203     lsl             r1, r1, #1 |  | 
| 204     lsl             r3, r3, #1 |  | 
| 205 |  | 
| 206 vp9_convolve8_avg_loop_vert_h |  | 
| 207     mov             r4, r0 |  | 
| 208     add             r7, r0, r1, asr #1 |  | 
| 209     mov             r5, r2 |  | 
| 210     add             r8, r2, r3, asr #1 |  | 
| 211     mov             r12, lr                 ; h loop counter, reloaded from lr per strip; r4/r5 now walk the even src/dst rows and r7/r8 the odd ones, all in steps of 2 rows because r1 and r3 were doubled above |  | 
| 212 |  | 
| 213     vld1.u32        {d16[0]}, [r4], r1 |  | 
| 214     vld1.u32        {d16[1]}, [r7], r1 |  | 
| 215     vld1.u32        {d18[0]}, [r4], r1 |  | 
| 216     vld1.u32        {d18[1]}, [r7], r1 |  | 
| 217     vld1.u32        {d20[0]}, [r4], r1 |  | 
| 218     vld1.u32        {d20[1]}, [r7], r1 |  | 
| 219     vld1.u32        {d22[0]}, [r4], r1 |  | 
| 220 |  | 
| 221     vmovl.u8        q8, d16 |  | 
| 222     vmovl.u8        q9, d18 |  | 
| 223     vmovl.u8        q10, d20 |  | 
| 224     vmovl.u8        q11, d22 |  | 
| 225 |  | 
| 226 vp9_convolve8_avg_loop_vert |  | 
| 227     ; always process a 4x4 block at a time: fetch the 4 rows below the 7 already held in d16..d22; in row order they are loaded into d24[0], d26[0], d26[1], d24[1] |  | 
| 228     vld1.u32        {d24[0]}, [r7], r1 |  | 
| 229     vld1.u32        {d26[0]}, [r4], r1 |  | 
| 230     vld1.u32        {d26[1]}, [r7], r1 |  | 
| 231     vld1.u32        {d24[1]}, [r4], r1 |  | 
| 232 |  | 
| 233     ; extract to s16: zero-extended from u8, the 4 new rows end up, top to bottom, in d24, d26, d27, d25 |  | 
| 234     vmovl.u8        q12, d24 |  | 
| 235     vmovl.u8        q13, d26 |  | 
| 236 |  | 
| 237     vld1.u32        {d6[0]}, [r5@32], r3 |  | 
| 238     vld1.u32        {d6[1]}, [r8@32], r3 |  | 
| 239     vld1.u32        {d7[0]}, [r5@32], r3 |  | 
| 240     vld1.u32        {d7[1]}, [r8@32], r3 |  | 
| 241 |  | 
| 242     pld             [r7] |  | 
| 243     pld             [r4] |  | 
| 244 |  | 
| 245     ; src[] * filter_y: one macro call per output row, each operand list shifted one row down; the pld hints are interleaved between the calls |  | 
| 246     MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24 |  | 
| 247 |  | 
| 248     pld             [r7, r1] |  | 
| 249     pld             [r4, r1] |  | 
| 250 |  | 
| 251     MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26 |  | 
| 252 |  | 
| 253     pld             [r5] |  | 
| 254     pld             [r8] |  | 
| 255 |  | 
| 256     MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27 |  | 
| 257 |  | 
| 258     pld             [r5, r3] |  | 
| 259     pld             [r8, r3] |  | 
| 260 |  | 
| 261     MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25 |  | 
| 262 |  | 
| 263     ; += 64 >> 7: vqrshrun.s32 rounds, shifts right by 7 and narrows s32 to u16 with unsigned saturation |  | 
| 264     vqrshrun.s32    d2, q1, #7 |  | 
| 265     vqrshrun.s32    d3, q2, #7 |  | 
| 266     vqrshrun.s32    d4, q14, #7 |  | 
| 267     vqrshrun.s32    d5, q15, #7 |  | 
| 268 |  | 
| 269     ; saturate: narrow u16 to u8; d2 = output rows 0 and 1, d3 = rows 2 and 3, the same layout as the dst rows loaded into d6 and d7, so no transpose is needed here |  | 
| 270     vqmovn.u16      d2, q1 |  | 
| 271     vqmovn.u16      d3, q2 |  | 
| 272 |  | 
| 273     ; average the new value and the dst value: vrhadd.u8 computes (a + b + 1) >> 1 per byte |  | 
| 274     vrhadd.u8       q1, q1, q3 |  | 
| 275 |  | 
| 276     sub             r5, r5, r3, lsl #1      ; reset for store: the two loads through r5 advanced it by 2 * r3 (the next line does the same for r8) |  | 
| 277     sub             r8, r8, r3, lsl #1 |  | 
| 278 |  | 
| 279     vst1.u32        {d2[0]}, [r5@32], r3 |  | 
| 280     vst1.u32        {d2[1]}, [r8@32], r3 |  | 
| 281     vst1.u32        {d3[0]}, [r5@32], r3 |  | 
| 282     vst1.u32        {d3[1]}, [r8@32], r3 |  | 
| 283 |  | 
| 284     vmov            q8, q10 |  | 
| 285     vmov            d18, d22 |  | 
| 286     vmov            d19, d24 |  | 
| 287     vmov            q10, q13 |  | 
| 288     vmov            d22, d25 |  | 
| 289 |  | 
| 290     subs            r12, r12, #4            ; h -= 4 (the five vmov above moved the last 7 rows into d16..d22 as the history for the next iteration) |  | 
| 291     bgt             vp9_convolve8_avg_loop_vert |  | 
| 292 |  | 
| 293     ; outer loop: step src and dst 4 bytes to the right, to the top of the next strip |  | 
| 294     add             r0, r0, #4 |  | 
| 295     add             r2, r2, #4 |  | 
| 296     subs            r6, r6, #4              ; w -= 4 |  | 
| 297     bgt             vp9_convolve8_avg_loop_vert_h |  | 
| 298 |  | 
| 299     pop             {r4-r8, pc} |  | 
| 300 |  | 
| 301     ENDP |  | 
| 302     END |  | 
| OLD | NEW | 
|---|