OLD | NEW |
(Empty) | |
| 1 ; |
| 2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
| 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; |
| 10 |
| 11 EXPORT |vp9_lpf_horizontal_4_neon| |
| 12 EXPORT |vp9_lpf_vertical_4_neon| |
| 13 ARM |
| 14 |
| 15 AREA ||.text||, CODE, READONLY, ALIGN=2 |
| 16 |
| 17 ; Each loop iteration filters 8 pixels along a horizontal edge. The vp8 loop |
| 18 ; filter works on 16 at a time. |
| 19 ; TODO(fgalligan): See about removing the count code as this function is only |
| 20 ; called with a count of 1. |
| 21 ; |
| 22 ; void vp9_lpf_horizontal_4_neon(uint8_t *s, |
| 23 ; int p /* pitch */, |
| 24 ; const uint8_t *blimit, |
| 25 ; const uint8_t *limit, |
| 26 ; const uint8_t *thresh, |
| 27 ; int count) |
| 28 ; |
| 29 ; r0 uint8_t *s, |
| 30 ; r1 int p, /* pitch */ |
| 31 ; r2 const uint8_t *blimit, |
| 32 ; r3 const uint8_t *limit, |
| 33 ; sp const uint8_t *thresh, (read from sp+4 once lr has been pushed) |
| 34 ; sp+4 int count (read from sp+8 once lr has been pushed) |
| 35 |vp9_lpf_horizontal_4_neon| PROC |
| 36 push {lr} |
| 37 |
| 38 vld1.8 {d0[]}, [r2] ; duplicate *blimit into every lane of d0 |
| 39 ldr r12, [sp, #8] ; load count (loop counter) |
| 40 ldr r2, [sp, #4] ; load thresh pointer (r2 is free once blimit is read) |
| 41 add r1, r1, r1 ; double pitch: r2 and r3 each step two rows per access |
| 42 |
| 43 cmp r12, #0 |
| 44 beq end_vp9_lf_h_edge |
| 45 |
| 46 vld1.8 {d1[]}, [r3] ; duplicate *limit into every lane of d1 |
| 47 vld1.8 {d2[]}, [r2] ; duplicate *thresh into every lane of d2 |
| 48 |
| 49 count_lf_h_loop |
| 50 sub r2, r0, r1, lsl #1 ; r2 = s - 4 * pitch (row holding p3) |
| 51 add r3, r2, r1, lsr #1 ; r3 = r2 + (r1 >> 1), i.e. one pitch further (row holding p2) |
| 52 |
| 53 vld1.u8 {d3}, [r2@64], r1 ; p3 |
| 54 vld1.u8 {d4}, [r3@64], r1 ; p2 |
| 55 vld1.u8 {d5}, [r2@64], r1 ; p1 |
| 56 vld1.u8 {d6}, [r3@64], r1 ; p0 |
| 57 vld1.u8 {d7}, [r2@64], r1 ; q0 |
| 58 vld1.u8 {d16}, [r3@64], r1 ; q1 |
| 59 vld1.u8 {d17}, [r2@64] ; q2 (no post-increment) |
| 60 vld1.u8 {d18}, [r3@64] ; q3 (no post-increment) |
| 61 |
| 62 sub r2, r2, r1, lsl #1 |
| 63 sub r3, r3, r1, lsl #1 |
| 64 |
| 65 bl vp9_loop_filter_neon |
| 66 |
| 67 vst1.u8 {d4}, [r2@64], r1 ; store op1 |
| 68 vst1.u8 {d5}, [r3@64], r1 ; store op0 |
| 69 vst1.u8 {d6}, [r2@64], r1 ; store oq0 |
| 70 vst1.u8 {d7}, [r3@64], r1 ; store oq1 |
| 71 |
| 72 add r0, r0, #8 |
| 73 subs r12, r12, #1 |
| 74 bne count_lf_h_loop |
| 75 |
| 76 end_vp9_lf_h_edge |
| 77 pop {pc} |
| 78 ENDP ; |vp9_lpf_horizontal_4_neon| |
| 79 |
| 80 ; Each loop iteration filters 8 rows across a vertical edge. The vp8 loop |
| 81 ; filter works on 16 at a time. |
| 82 ; TODO(fgalligan): See about removing the count code as this function is only |
| 83 ; called with a count of 1. |
| 84 ; |
| 85 ; void vp9_lpf_vertical_4_neon(uint8_t *s, |
| 86 ; int p /* pitch */, |
| 87 ; const uint8_t *blimit, |
| 88 ; const uint8_t *limit, |
| 89 ; const uint8_t *thresh, |
| 90 ; int count) |
| 91 ; |
| 92 ; r0 uint8_t *s, |
| 93 ; r1 int p, /* pitch */ |
| 94 ; r2 const uint8_t *blimit, |
| 95 ; r3 const uint8_t *limit, |
| 96 ; sp const uint8_t *thresh, (read from sp+4 once lr has been pushed) |
| 97 ; sp+4 int count (read from sp+8 once lr has been pushed) |
| 98 |vp9_lpf_vertical_4_neon| PROC |
| 99 push {lr} |
| 100 |
| 101 vld1.8 {d0[]}, [r2] ; duplicate *blimit into every lane of d0 |
| 102 ldr r12, [sp, #8] ; load count (loop counter) |
| 103 vld1.8 {d1[]}, [r3] ; duplicate *limit into every lane of d1 |
| 104 |
| 105 ldr r3, [sp, #4] ; load thresh pointer (r3 is free once limit is read) |
| 106 sub r2, r0, #4 ; r2 = s - 4: load pointer, 4 columns before the edge |
| 107 cmp r12, #0 |
| 108 beq end_vp9_lf_v_edge |
| 109 |
| 110 vld1.8 {d2[]}, [r3] ; duplicate *thresh into every lane of d2 |
| 111 |
| 112 count_lf_v_loop |
| 113 vld1.u8 {d3}, [r2], r1 ; load 8 rows of 8 bytes starting at s - 4 |
| 114 vld1.u8 {d4}, [r2], r1 |
| 115 vld1.u8 {d5}, [r2], r1 |
| 116 vld1.u8 {d6}, [r2], r1 |
| 117 vld1.u8 {d7}, [r2], r1 |
| 118 vld1.u8 {d16}, [r2], r1 |
| 119 vld1.u8 {d17}, [r2], r1 |
| 120 vld1.u8 {d18}, [r2] |
| 121 |
| 122 ;transpose the 8x8 block so d3..d18 hold columns p3..q3 |
| 123 vtrn.32 d3, d7 |
| 124 vtrn.32 d4, d16 |
| 125 vtrn.32 d5, d17 |
| 126 vtrn.32 d6, d18 |
| 127 |
| 128 vtrn.16 d3, d5 |
| 129 vtrn.16 d4, d6 |
| 130 vtrn.16 d7, d17 |
| 131 vtrn.16 d16, d18 |
| 132 |
| 133 vtrn.8 d3, d4 |
| 134 vtrn.8 d5, d6 |
| 135 vtrn.8 d7, d16 |
| 136 vtrn.8 d17, d18 |
| 137 |
| 138 bl vp9_loop_filter_neon |
| 139 |
| 140 sub r0, r0, #2 ; r0 = row start - 2: column of op1 |
| 141 |
| 142 ;store op1, op0, oq0, oq1 (4 interleaved bytes per row, 8 rows) |
| 143 vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1 |
| 144 vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1 |
| 145 vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1 |
| 146 vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1 |
| 147 vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1 |
| 148 vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1 |
| 149 vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1 |
| 150 vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0], r1 |
| 151 |
| 152 add r0, r0, #2 ; undo the -2 column offset: r0 = s + pitch * 8 |
| 153 subs r12, r12, #1 |
| 154 subne r2, r0, #4 ; r2 = s - 4 for the next group of 8 rows |
| 155 bne count_lf_v_loop |
| 156 |
| 157 end_vp9_lf_v_edge |
| 158 pop {pc} |
| 159 ENDP ; |vp9_lpf_vertical_4_neon| |
| 160 |
| 161 ; void vp9_loop_filter_neon(); |
| 162 ; This is a helper function for the loopfilters. The individual functions do the |
| 163 ; necessary load, transpose (if necessary) and store. The function does not use |
| 164 ; registers d8-d15. |
| 165 ; Clobbers d3 and d16-d28 (q12 is d24/d25); d4-d7 are replaced by the outputs. |
| 166 ; Inputs: |
| 167 ; r0-r3, r12 PRESERVE (no core register is written by this helper) |
| 168 ; d0 blimit |
| 169 ; d1 limit |
| 170 ; d2 thresh |
| 171 ; d3 p3 |
| 172 ; d4 p2 |
| 173 ; d5 p1 |
| 174 ; d6 p0 |
| 175 ; d7 q0 |
| 176 ; d16 q1 |
| 177 ; d17 q2 |
| 178 ; d18 q3 |
| 179 ; |
| 180 ; Outputs: |
| 181 ; d4 op1 |
| 182 ; d5 op0 |
| 183 ; d6 oq0 |
| 184 ; d7 oq1 |
| 185 |vp9_loop_filter_neon| PROC |
| 186 ; filter_mask |
| 187 vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) |
| 188 vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) |
| 189 vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) |
| 190 vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) |
| 191 vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1) (p3 in d3 is no longer needed) |
| 192 vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2) (p2 in d4 is no longer needed) |
| 193 |
| 194 ; only compare the largest value to limit |
| 195 vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) |
| 196 vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) |
| 197 |
| 198 vabd.u8 d17, d6, d7 ; abs(p0 - q0) |
| 199 |
| 200 vmax.u8 d3, d3, d4 ; m3 = max(m5, m6) |
| 201 |
| 202 vmov.u8 d18, #0x80 |
| 203 |
| 204 vmax.u8 d23, d19, d20 ; m1 = max(m1, m2) |
| 205 |
| 206 ; hevmask |
| 207 vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 |
| 208 vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1 |
| 209 vmax.u8 d23, d23, d3 ; m1 = max(m1, m3): largest of the six differences |
| 210 |
| 211 vabd.u8 d28, d5, d16 ; a = abs(p1 - q1) |
| 212 vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2 (saturating) |
| 213 |
| 214 veor d7, d7, d18 ; qs0 = q0 ^ 0x80 |
| 215 |
| 216 vcge.u8 d23, d1, d23 ; (limit >= m1)*-1: lanes that pass the limit test |
| 217 |
| 218 ; filter() function |
| 219 ; convert to signed |
| 220 |
| 221 vshr.u8 d28, d28, #1 ; a = a / 2 |
| 222 veor d6, d6, d18 ; ps0 = p0 ^ 0x80 |
| 223 |
| 224 veor d5, d5, d18 ; ps1 = p1 ^ 0x80 |
| 225 vqadd.u8 d17, d17, d28 ; a = b + a = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 (saturating) |
| 226 |
| 227 veor d16, d16, d18 ; qs1 = q1 ^ 0x80 |
| 228 |
| 229 vmov.u8 d19, #3 |
| 230 |
| 231 vsub.s8 d28, d7, d6 ; ( qs0 - ps0) |
| 232 |
| 233 vcge.u8 d17, d0, d17 ; (blimit >= a)*-1: lanes that pass the blimit test |
| 234 |
| 235 vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1) |
| 236 vorr d22, d21, d22 ; hevmask |
| 237 |
| 238 vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0), widened to 16 bits |
| 239 |
| 240 vand d27, d27, d22 ; filter &= hev |
| 241 vand d23, d23, d17 ; filter_mask = limit test & blimit test |
| 242 |
| 243 vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0) |
| 244 |
| 245 vmov.u8 d17, #4 |
| 246 |
| 247 ; filter = clamp(filter + 3 * ( qs0 - ps0)) |
| 248 vqmovn.s16 d27, q12 |
| 249 |
| 250 vand d27, d27, d23 ; filter &= mask |
| 251 |
| 252 vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3) |
| 253 vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4) |
| 254 vshr.s8 d28, d28, #3 ; filter2 >>= 3 |
| 255 vshr.s8 d27, d27, #3 ; filter1 >>= 3 |
| 256 |
| 257 vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2) |
| 258 vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1) |
| 259 |
| 260 ; outer tap adjustments |
| 261 vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1 (rounding shift) |
| 262 |
| 263 veor d6, d26, d18 ; *oq0 = u^0x80 |
| 264 |
| 265 vbic d27, d27, d22 ; filter &= ~hev |
| 266 |
| 267 vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter) |
| 268 vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter) |
| 269 |
| 270 veor d5, d19, d18 ; *op0 = u^0x80 |
| 271 veor d4, d21, d18 ; *op1 = u^0x80 |
| 272 veor d7, d20, d18 ; *oq1 = u^0x80 |
| 273 |
| 274 bx lr |
| 275 ENDP ; |vp9_loop_filter_neon| |
| 276 |
| 277 END |
OLD | NEW |