| Index: source/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm | 
| =================================================================== | 
| --- source/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm	(revision 96967) | 
| +++ source/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm	(working copy) | 
| @@ -53,14 +53,11 @@ | 
|  | 
| ;r0     unsigned char *src_ptr, | 
| ;r1     int src_pixel_step, | 
| -;r2     const char *flimit, | 
| +;r2     const char *blimit,          ; only byte 0 of blimit/limit/thresh is read (ldrb) | 
| ;r3     const char *limit, | 
| ;stack  const char *thresh, | 
| ;stack  int  count | 
|  | 
| -;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed | 
| -;for flimit. Same way applies to limit and thresh. | 
| - | 
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | 
| |vp8_loop_filter_horizontal_edge_armv6| PROC | 
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | 
| @@ -72,14 +69,18 @@ | 
| sub         sp, sp, #16                 ; create temp buffer | 
|  | 
| ldr         r9, [src], pstep            ; p3 | 
| -    ldr         r4, [r2], #4                ; flimit | 
| +    ldrb        r4, [r2]                    ; blimit: single byte, zero-extended; the orr pair on r4 below copies it into all 4 byte lanes | 
| ldr         r10, [src], pstep           ; p2 | 
| -    ldr         r2, [r3], #4                ; limit | 
| +    ldrb        r2, [r3]                    ; limit: single byte; overwrites the blimit pointer in r2, splatted to 4 lanes below | 
| ldr         r11, [src], pstep           ; p1 | 
| -    uadd8       r4, r4, r4                  ; flimit * 2 | 
| -    ldr         r3, [r6], #4                ; thresh | 
| +    orr         r4, r4, r4, lsl #8 | 
| +    ldrb        r3, [r6]                    ; thresh: single byte; overwrites the limit pointer in r3, splatted to 4 lanes below | 
| +    orr         r2, r2, r2, lsl #8 | 
| mov         count, count, lsl #1        ; 4-in-parallel | 
| -    uadd8       r4, r4, r2                  ; flimit * 2 + limit | 
| +    orr         r4, r4, r4, lsl #16 | 
| +    orr         r3, r3, r3, lsl #8 | 
| +    orr         r2, r2, r2, lsl #16 | 
| +    orr         r3, r3, r3, lsl #16 | 
|  | 
| |Hnext8| | 
| ; vp8_filter_mask() function | 
| @@ -253,12 +254,6 @@ | 
|  | 
| subs        count, count, #1 | 
|  | 
| -    ;pld            [src] | 
| -    ;pld            [src, pstep] | 
| -    ;pld            [src, pstep, lsl #1] | 
| -    ;pld            [src, pstep, lsl #2] | 
| -    ;pld            [src, pstep, lsl #3] | 
| - | 
| ldrne       r9, [src], pstep            ; p3 | 
| ldrne       r10, [src], pstep           ; p2 | 
| ldrne       r11, [src], pstep           ; p1 | 
| @@ -281,14 +276,18 @@ | 
| sub         sp, sp, #16                 ; create temp buffer | 
|  | 
| ldr         r9, [src], pstep            ; p3 | 
| -    ldr         r4, [r2], #4                ; flimit | 
| +    ldrb        r4, [r2]                    ; blimit: single byte, zero-extended; the orr pair on r4 below copies it into all 4 byte lanes | 
| ldr         r10, [src], pstep           ; p2 | 
| -    ldr         r2, [r3], #4                ; limit | 
| +    ldrb        r2, [r3]                    ; limit: single byte; overwrites the blimit pointer in r2, splatted to 4 lanes below | 
| ldr         r11, [src], pstep           ; p1 | 
| -    uadd8       r4, r4, r4                  ; flimit * 2 | 
| -    ldr         r3, [r6], #4                ; thresh | 
| +    orr         r4, r4, r4, lsl #8 | 
| +    ldrb        r3, [r6]                    ; thresh: single byte; overwrites the limit pointer in r3, splatted to 4 lanes below | 
| +    orr         r2, r2, r2, lsl #8 | 
| mov         count, count, lsl #1        ; 4-in-parallel | 
| -    uadd8       r4, r4, r2                  ; flimit * 2 + limit | 
| +    orr         r4, r4, r4, lsl #16 | 
| +    orr         r3, r3, r3, lsl #8 | 
| +    orr         r2, r2, r2, lsl #16 | 
| +    orr         r3, r3, r3, lsl #16 | 
|  | 
| |MBHnext8| | 
|  | 
| @@ -590,15 +589,19 @@ | 
| sub         sp, sp, #16                 ; create temp buffer | 
|  | 
| ldr         r6, [src], pstep            ; load source data | 
| -    ldr         r4, [r2], #4                ; flimit | 
| +    ldrb        r4, [r2]                    ; blimit: single byte, zero-extended; the orr pair on r4 below copies it into all 4 byte lanes | 
| ldr         r7, [src], pstep | 
| -    ldr         r2, [r3], #4                ; limit | 
| +    ldrb        r2, [r3]                    ; limit: single byte; overwrites the blimit pointer in r2, splatted to 4 lanes below | 
| ldr         r8, [src], pstep | 
| -    uadd8       r4, r4, r4                  ; flimit * 2 | 
| -    ldr         r3, [r12], #4               ; thresh | 
| +    orr         r4, r4, r4, lsl #8 | 
| +    ldrb        r3, [r12]                   ; thresh: single byte; overwrites the limit pointer in r3, splatted to 4 lanes below | 
| +    orr         r2, r2, r2, lsl #8 | 
| ldr         lr, [src], pstep | 
| mov         count, count, lsl #1        ; 4-in-parallel | 
| -    uadd8       r4, r4, r2                  ; flimit * 2 + limit | 
| +    orr         r4, r4, r4, lsl #16 | 
| +    orr         r3, r3, r3, lsl #8 | 
| +    orr         r2, r2, r2, lsl #16 | 
| +    orr         r3, r3, r3, lsl #16 | 
|  | 
| |Vnext8| | 
|  | 
| @@ -857,18 +860,26 @@ | 
| sub         src, src, #4                ; move src pointer down by 4 | 
| ldr         count, [sp, #40]            ; count for 8-in-parallel | 
| ldr         r12, [sp, #36]              ; load thresh address | 
| +    pld         [src, #23]                  ; preload for next block -- NOTE(review): the #23 offset is not explained here; confirm it targets the intended cache line | 
| sub         sp, sp, #16                 ; create temp buffer | 
|  | 
| ldr         r6, [src], pstep            ; load source data | 
| -    ldr         r4, [r2], #4                ; flimit | 
| +    ldrb        r4, [r2]                    ; blimit: single byte, zero-extended; the orr pair on r4 below copies it into all 4 byte lanes | 
| +    pld         [src, #23] | 
| ldr         r7, [src], pstep | 
| -    ldr         r2, [r3], #4                ; limit | 
| +    ldrb        r2, [r3]                    ; limit: single byte; overwrites the blimit pointer in r2, splatted to 4 lanes below | 
| +    pld         [src, #23] | 
| ldr         r8, [src], pstep | 
| -    uadd8       r4, r4, r4                  ; flimit * 2 | 
| -    ldr         r3, [r12], #4               ; thresh | 
| +    orr         r4, r4, r4, lsl #8 | 
| +    ldrb        r3, [r12]                   ; thresh: single byte; overwrites the limit pointer in r3, splatted to 4 lanes below | 
| +    orr         r2, r2, r2, lsl #8 | 
| +    pld         [src, #23] | 
| ldr         lr, [src], pstep | 
| mov         count, count, lsl #1        ; 4-in-parallel | 
| -    uadd8       r4, r4, r2                  ; flimit * 2 + limit | 
| +    orr         r4, r4, r4, lsl #16 | 
| +    orr         r3, r3, r3, lsl #8 | 
| +    orr         r2, r2, r2, lsl #16 | 
| +    orr         r3, r3, r3, lsl #16 | 
|  | 
| |MBVnext8| | 
| ; vp8_filter_mask() function | 
| @@ -908,6 +919,7 @@ | 
| str         lr, [sp, #8] | 
| ldr         lr, [src], pstep | 
|  | 
| + | 
| TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 | 
|  | 
| ldr         lr, [sp, #8]                ; load back (f)limit accumulator | 
| @@ -956,6 +968,7 @@ | 
| beq         mbvskip_filter               ; skip filtering | 
|  | 
|  | 
| + | 
| ;vp8_hevmask() function | 
| ;calculate high edge variance | 
|  | 
| @@ -1123,6 +1136,7 @@ | 
| smlabb      r8, r6, lr, r7 | 
| smlatb      r6, r6, lr, r7 | 
| smlabb      r9, r10, lr, r7 | 
| + | 
| smlatb      r10, r10, lr, r7 | 
| ssat        r8, #8, r8, asr #7 | 
| ssat        r6, #8, r6, asr #7 | 
| @@ -1242,9 +1256,13 @@ | 
| sub         src, src, #4 | 
| subs        count, count, #1 | 
|  | 
| +    pld         [src, #23]                  ; preload for next block; issued unconditionally (the loads below are ldrne) and leaves the subs flags intact | 
| ldrne       r6, [src], pstep            ; load source data | 
| +    pld         [src, #23] | 
| ldrne       r7, [src], pstep | 
| +    pld         [src, #23] | 
| ldrne       r8, [src], pstep | 
| +    pld         [src, #23] | 
| ldrne       lr, [src], pstep | 
|  | 
| bne         MBVnext8 | 
|  |