OLD | NEW |
(Empty) | |
| 1 ; |
| 2 ; Copyright (c) 2011 The WebM project authors. All Rights Reserved. |
| 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; |
| 10 |
| 11 |
| 12 EXPORT |vpx_variance16x16_media| |
| 13 EXPORT |vpx_variance8x8_media| |
| 14 EXPORT |vpx_mse16x16_media| |
| 15 |
| 16 ARM |
| 17 REQUIRE8 |
| 18 PRESERVE8 |
| 19 |
| 20 AREA ||.text||, CODE, READONLY, ALIGN=2 |
| 21 |
;-----------------------------------------------------------------------
; vpx_variance16x16_media
;
; Computes the sum of differences and the sum of squared differences
; between a 16x16 block of source pixels and a 16x16 block of reference
; pixels, stores the sum of squared differences through the sse pointer
; and returns  sse - ((sum * sum) >> 8).
;
; In:
;   r0      unsigned char *src_ptr
;   r1      int  source_stride
;   r2      unsigned char *ref_ptr
;   r3      int  recon_stride
;   stack   unsigned int *sse
; Out:
;   r0      sse - ((sum * sum) >> 8)
;   [sse]   sum of squared differences
;
; Register roles inside the loop:
;   r8      running signed sum of (src - ref)
;   r11     running sum of squared differences
;   r12     rows remaining
;   lr      constant zero (second operand for sel / usad8)
;   r4-r7, r9, r10   scratch
;
; Per group of four pixels:
;   usub8 a - b sets one GE flag per byte where a >= b; sel then keeps
;   that byte of the difference and replaces the others with zero.
;   Doing this in both directions yields the positive and the negative
;   differences as unsigned bytes. Their byte sums (usad8 against zero)
;   update the signed sum; their OR is |src - ref| for every byte, which
;   is widened with uxtb16 and squared/accumulated with smlad.
;-----------------------------------------------------------------------
|vpx_variance16x16_media| PROC

    stmfd   sp!, {r4-r12, lr}       ; 10 registers = 40 bytes on the stack

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r8, #0                  ; sum = 0
    mov     r11, #0                 ; sse = 0
    mov     r12, #16                ; loop counter = block height
    mov     lr, #0                  ; constant zero; never written in the loop

loop16x16
    ; ---- 1st 4 pixels ----
    ldr     r4, [r0, #0]            ; 4 src pixels
    ldr     r5, [r2, #0]            ; 4 ref pixels

    usub8   r6, r4, r5              ; src - ref, per byte
    pld     [r0, r1, lsl #1]        ; prefetch src two rows ahead
    sel     r7, r6, lr              ; bytes with src >= ref, others 0
    usub8   r9, r5, r4              ; ref - src, per byte
    pld     [r2, r3, lsl #1]        ; prefetch ref two rows ahead
    sel     r6, r9, lr              ; bytes with ref >= src, others 0

    usad8   r4, r7, lr              ; sum of positive differences
    usad8   r5, r6, lr              ; sum of negative differences (magnitude)
    orr     r6, r6, r7              ; |src - ref| for all 4 pixels

    add     r8, r8, r4              ; sum += positive part
    sub     r8, r8, r5              ; sum -= negative part

    uxtb16  r5, r6                  ; |diff| of pixels 0 and 2 as halfwords
    uxtb16  r10, r6, ror #8         ; |diff| of pixels 1 and 3 as halfwords
    smlad   r11, r5, r5, r11        ; sse += d0*d0 + d2*d2

    ; ---- 2nd 4 pixels ----
    ldr     r4, [r0, #4]            ; 4 src pixels
    ldr     r5, [r2, #4]            ; 4 ref pixels
    smlad   r11, r10, r10, r11      ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r6, r4, r5              ; src - ref, per byte
    sel     r7, r6, lr              ; bytes with src >= ref, others 0
    usub8   r9, r5, r4              ; ref - src, per byte
    sel     r6, r9, lr              ; bytes with ref >= src, others 0

    usad8   r4, r7, lr              ; sum of positive differences
    usad8   r5, r6, lr              ; sum of negative differences (magnitude)
    orr     r6, r6, r7              ; |src - ref| for all 4 pixels

    add     r8, r8, r4              ; sum += positive part
    sub     r8, r8, r5              ; sum -= negative part

    uxtb16  r5, r6                  ; |diff| of pixels 0 and 2 as halfwords
    uxtb16  r10, r6, ror #8         ; |diff| of pixels 1 and 3 as halfwords
    smlad   r11, r5, r5, r11        ; sse += d0*d0 + d2*d2

    ; ---- 3rd 4 pixels ----
    ldr     r4, [r0, #8]            ; 4 src pixels
    ldr     r5, [r2, #8]            ; 4 ref pixels
    smlad   r11, r10, r10, r11      ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r6, r4, r5              ; src - ref, per byte
    sel     r7, r6, lr              ; bytes with src >= ref, others 0
    usub8   r9, r5, r4              ; ref - src, per byte
    sel     r6, r9, lr              ; bytes with ref >= src, others 0

    usad8   r4, r7, lr              ; sum of positive differences
    usad8   r5, r6, lr              ; sum of negative differences (magnitude)
    orr     r6, r6, r7              ; |src - ref| for all 4 pixels

    add     r8, r8, r4              ; sum += positive part
    sub     r8, r8, r5              ; sum -= negative part

    uxtb16  r5, r6                  ; |diff| of pixels 0 and 2 as halfwords
    uxtb16  r10, r6, ror #8         ; |diff| of pixels 1 and 3 as halfwords
    smlad   r11, r5, r5, r11        ; sse += d0*d0 + d2*d2

    ; ---- 4th 4 pixels ----
    ldr     r4, [r0, #12]           ; 4 src pixels
    ldr     r5, [r2, #12]           ; 4 ref pixels
    smlad   r11, r10, r10, r11      ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r6, r4, r5              ; src - ref, per byte
    add     r0, r0, r1              ; src_ptr -> next row
    sel     r7, r6, lr              ; bytes with src >= ref, others 0
    usub8   r9, r5, r4              ; ref - src, per byte
    add     r2, r2, r3              ; ref_ptr -> next row
    sel     r6, r9, lr              ; bytes with ref >= src, others 0

    usad8   r4, r7, lr              ; sum of positive differences
    usad8   r5, r6, lr              ; sum of negative differences (magnitude)
    orr     r6, r6, r7              ; |src - ref| for all 4 pixels

    add     r8, r8, r4              ; sum += positive part
    sub     r8, r8, r5              ; sum -= negative part

    uxtb16  r5, r6                  ; |diff| of pixels 0 and 2 as halfwords
    uxtb16  r10, r6, ror #8         ; |diff| of pixels 1 and 3 as halfwords
    smlad   r11, r5, r5, r11        ; sse += d0*d0 + d2*d2
    smlad   r11, r10, r10, r11      ; sse += d1*d1 + d3*d3

    subs    r12, r12, #1            ; one row done
    bne     loop16x16

    ; ---- return ----
    ldr     r6, [sp, #40]           ; sse pointer sits just above the 40 saved bytes
    mul     r0, r8, r8              ; sum * sum
    str     r11, [r6]               ; *sse = sse
    sub     r0, r11, r0, lsr #8     ; return sse - ((sum * sum) >> 8)

    ldmfd   sp!, {r4-r12, pc}

    ENDP
| 154 |
;-----------------------------------------------------------------------
; vpx_variance8x8_media
;
; Computes the sum of differences and the sum of squared differences
; between an 8x8 block of source pixels and an 8x8 block of reference
; pixels, stores the sum of squared differences through the sse pointer
; and returns  sse - ((sum * sum) >> 6).
;
; In:
;   r0      unsigned char *src_ptr
;   r1      int  source_stride
;   r2      unsigned char *ref_ptr
;   r3      int  recon_stride
;   stack   unsigned int *sse
; Out:
;   r0      sse - ((sum * sum) >> 6)
;   [sse]   sum of squared differences
;
; Register roles inside the loop:
;   r4      running signed sum of (src - ref)
;   r5      running sum of squared differences
;   r12     rows remaining
;   lr      constant zero (second operand for sel / usad8)
;   r6-r10  scratch
;
; Per group of four pixels:
;   usub8 a - b sets one GE flag per byte where a >= b; sel then keeps
;   that byte of the difference and replaces the others with zero.
;   Doing this in both directions yields the positive and the negative
;   differences as unsigned bytes. Their byte sums (usad8 against zero)
;   update the signed sum; their OR is |src - ref| for every byte, which
;   is widened with uxtb16 and squared/accumulated with smlad.
;-----------------------------------------------------------------------
|vpx_variance8x8_media| PROC

    push    {r4-r10, lr}            ; 8 registers = 32 bytes on the stack

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r12, #8                 ; loop counter = block height
    mov     r4, #0                  ; sum = 0
    mov     r5, #0                  ; sse = 0
    mov     lr, #0                  ; constant zero; never written in the loop

loop8x8
    ; ---- 1st 4 pixels ----
    ldr     r6, [r0, #0x0]          ; 4 src pixels
    ldr     r7, [r2, #0x0]          ; 4 ref pixels

    usub8   r8, r6, r7              ; src - ref, per byte
    pld     [r0, r1, lsl #1]        ; prefetch src two rows ahead
    sel     r10, r8, lr             ; bytes with src >= ref, others 0
    usub8   r9, r7, r6              ; ref - src, per byte
    pld     [r2, r3, lsl #1]        ; prefetch ref two rows ahead
    sel     r8, r9, lr              ; bytes with ref >= src, others 0

    usad8   r6, r10, lr             ; sum of positive differences
    usad8   r7, r8, lr              ; sum of negative differences (magnitude)
    orr     r8, r8, r10             ; |src - ref| for all 4 pixels

    add     r4, r4, r6              ; sum += positive part
    sub     r4, r4, r7              ; sum -= negative part

    uxtb16  r7, r8                  ; |diff| of pixels 0 and 2 as halfwords
    uxtb16  r10, r8, ror #8         ; |diff| of pixels 1 and 3 as halfwords
    smlad   r5, r7, r7, r5          ; sse += d0*d0 + d2*d2

    ; ---- 2nd 4 pixels ----
    ldr     r6, [r0, #0x4]          ; 4 src pixels
    ldr     r7, [r2, #0x4]          ; 4 ref pixels
    smlad   r5, r10, r10, r5        ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r8, r6, r7              ; src - ref, per byte
    add     r0, r0, r1              ; src_ptr -> next row
    sel     r10, r8, lr             ; bytes with src >= ref, others 0
    usub8   r9, r7, r6              ; ref - src, per byte
    add     r2, r2, r3              ; ref_ptr -> next row
    sel     r8, r9, lr              ; bytes with ref >= src, others 0

    usad8   r6, r10, lr             ; sum of positive differences
    usad8   r7, r8, lr              ; sum of negative differences (magnitude)
    orr     r8, r8, r10             ; |src - ref| for all 4 pixels

    add     r4, r4, r6              ; sum += positive part
    sub     r4, r4, r7              ; sum -= negative part

    uxtb16  r7, r8                  ; |diff| of pixels 0 and 2 as halfwords
    uxtb16  r10, r8, ror #8         ; |diff| of pixels 1 and 3 as halfwords
    smlad   r5, r7, r7, r5          ; sse += d0*d0 + d2*d2
    subs    r12, r12, #1            ; one row done (flags consumed by bne below)
    smlad   r5, r10, r10, r5        ; sse += d1*d1 + d3*d3

    bne     loop8x8

    ; ---- return ----
    ldr     r8, [sp, #32]           ; sse pointer sits just above the 32 saved bytes
    mul     r1, r4, r4              ; sum * sum
    str     r5, [r8]                ; *sse = sse
    ; |sum| <= 64 * 255, so sum * sum < 2^31 and a logical shift gives the
    ; same value as an arithmetic one; lsr matches the 16x16 routine.
    sub     r0, r5, r1, lsr #6      ; return sse - ((sum * sum) >> 6)

    pop     {r4-r10, pc}

    ENDP
| 237 |
;-----------------------------------------------------------------------
; vpx_mse16x16_media
;
; Computes the sum of squared differences between a 16x16 block of
; source pixels and a 16x16 block of reference pixels, stores it through
; the sse pointer and also returns it in r0.
;
; Based on vpx_variance16x16_media. The sum of differences is not needed
; for this result, so no sum is accumulated here.
;
; In:
;   r0      unsigned char *src_ptr
;   r1      int  source_stride
;   r2      unsigned char *ref_ptr
;   r3      int  recon_stride
;   stack   unsigned int *sse
; Out:
;   r0      sum of squared differences
;   [sse]   sum of squared differences
;
; Register roles inside the loop:
;   r4      running sum of squared differences
;   r12     rows remaining
;   lr      constant zero (second operand for sel)
;   r5-r9   scratch
;
; Per group of four pixels:
;   usub8 a - b sets one GE flag per byte where a >= b; sel then keeps
;   that byte of the difference and replaces the others with zero.
;   Doing this in both directions and OR-ing the results yields
;   |src - ref| for every byte, which is widened with uxtb16 and
;   squared/accumulated with smlad.
;-----------------------------------------------------------------------
|vpx_mse16x16_media| PROC

    push    {r4-r9, lr}             ; 7 registers = 28 bytes on the stack

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r12, #16                ; loop counter = block height
    mov     r4, #0                  ; sse = 0
    mov     lr, #0                  ; constant zero; never written in the loop

loopmse
    ; ---- 1st 4 pixels ----
    ldr     r5, [r0, #0x0]          ; 4 src pixels
    ldr     r6, [r2, #0x0]          ; 4 ref pixels

    usub8   r8, r5, r6              ; src - ref, per byte
    pld     [r0, r1, lsl #1]        ; prefetch src two rows ahead
    sel     r7, r8, lr              ; bytes with src >= ref, others 0
    usub8   r9, r6, r5              ; ref - src, per byte
    pld     [r2, r3, lsl #1]        ; prefetch ref two rows ahead
    sel     r8, r9, lr              ; bytes with ref >= src, others 0
    orr     r8, r8, r7              ; |src - ref| for all 4 pixels

    ldr     r5, [r0, #0x4]          ; 4 src pixels of the next group

    uxtb16  r6, r8                  ; |diff| of pixels 0 and 2 as halfwords
    uxtb16  r7, r8, ror #8          ; |diff| of pixels 1 and 3 as halfwords
    smlad   r4, r6, r6, r4          ; sse += d0*d0 + d2*d2

    ; ---- 2nd 4 pixels ----
    ldr     r6, [r2, #0x4]          ; 4 ref pixels
    smlad   r4, r7, r7, r4          ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r8, r5, r6              ; src - ref, per byte
    sel     r7, r8, lr              ; bytes with src >= ref, others 0
    usub8   r9, r6, r5              ; ref - src, per byte
    sel     r8, r9, lr              ; bytes with ref >= src, others 0
    orr     r8, r8, r7              ; |src - ref| for all 4 pixels

    ldr     r5, [r0, #0x8]          ; 4 src pixels of the next group

    uxtb16  r6, r8                  ; |diff| of pixels 0 and 2 as halfwords
    uxtb16  r7, r8, ror #8          ; |diff| of pixels 1 and 3 as halfwords
    smlad   r4, r6, r6, r4          ; sse += d0*d0 + d2*d2

    ; ---- 3rd 4 pixels ----
    ldr     r6, [r2, #0x8]          ; 4 ref pixels
    smlad   r4, r7, r7, r4          ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r8, r5, r6              ; src - ref, per byte
    sel     r7, r8, lr              ; bytes with src >= ref, others 0
    usub8   r9, r6, r5              ; ref - src, per byte
    sel     r8, r9, lr              ; bytes with ref >= src, others 0
    orr     r8, r8, r7              ; |src - ref| for all 4 pixels

    ldr     r5, [r0, #0xc]          ; 4 src pixels of the next group

    uxtb16  r6, r8                  ; |diff| of pixels 0 and 2 as halfwords
    uxtb16  r7, r8, ror #8          ; |diff| of pixels 1 and 3 as halfwords
    smlad   r4, r6, r6, r4          ; sse += d0*d0 + d2*d2

    ; ---- 4th 4 pixels ----
    ldr     r6, [r2, #0xc]          ; 4 ref pixels
    smlad   r4, r7, r7, r4          ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r8, r5, r6              ; src - ref, per byte
    add     r0, r0, r1              ; src_ptr -> next row
    sel     r7, r8, lr              ; bytes with src >= ref, others 0
    usub8   r9, r6, r5              ; ref - src, per byte
    add     r2, r2, r3              ; ref_ptr -> next row
    sel     r8, r9, lr              ; bytes with ref >= src, others 0
    orr     r8, r8, r7              ; |src - ref| for all 4 pixels

    subs    r12, r12, #1            ; one row done (flags consumed by bne below)

    uxtb16  r6, r8                  ; |diff| of pixels 0 and 2 as halfwords
    uxtb16  r7, r8, ror #8          ; |diff| of pixels 1 and 3 as halfwords
    smlad   r4, r6, r6, r4          ; sse += d0*d0 + d2*d2
    smlad   r4, r7, r7, r4          ; sse += d1*d1 + d3*d3

    bne     loopmse

    ; ---- return ----
    ldr     r1, [sp, #28]           ; sse pointer sits just above the 28 saved bytes
    mov     r0, r4                  ; return sse
    str     r4, [r1]                ; *sse = sse

    pop     {r4-r9, pc}

    ENDP
| 357 |
| 358 END |
OLD | NEW |