| Index: source/libvpx/vp8/common/arm/armv6/filter_v6.asm | 
| =================================================================== | 
| --- source/libvpx/vp8/common/arm/armv6/filter_v6.asm	(revision 96967) | 
| +++ source/libvpx/vp8/common/arm/armv6/filter_v6.asm	(working copy) | 
| @@ -10,6 +10,8 @@ | 
|  | 
|  | 
| EXPORT  |vp8_filter_block2d_first_pass_armv6| | 
| +    EXPORT  |vp8_filter_block2d_first_pass_16x16_armv6| | 
| +    EXPORT  |vp8_filter_block2d_first_pass_8x8_armv6| | 
| EXPORT  |vp8_filter_block2d_second_pass_armv6| | 
| EXPORT  |vp8_filter4_block2d_second_pass_armv6| | 
| EXPORT  |vp8_filter_block2d_first_pass_only_armv6| | 
| @@ -40,11 +42,6 @@ | 
| add     r12, r3, #16                    ; square off the output | 
| sub     sp, sp, #4 | 
|  | 
| -    ;;IF ARCHITECTURE=6 | 
| -    ;pld        [r0, #-2] | 
| -    ;;pld       [r0, #30] | 
| -    ;;ENDIF | 
| - | 
| ldr     r4, [r11]                       ; load up packed filter coefficients | 
| ldr     r5, [r11, #4] | 
| ldr     r6, [r11, #8] | 
| @@ -101,15 +98,10 @@ | 
|  | 
| bne     width_loop_1st_6 | 
|  | 
| -    ;;add       r9, r2, #30                 ; attempt to load 2 adjacent cache lines | 
| -    ;;IF ARCHITECTURE=6 | 
| -    ;pld        [r0, r2] | 
| -    ;;pld       [r0, r9] | 
| -    ;;ENDIF | 
| - | 
| ldr     r1, [sp]                        ; load and update dst address | 
| subs    r7, r7, #0x10000 | 
| add     r0, r0, r2                      ; move to next input line | 
| + | 
| add     r1, r1, #2                      ; move over to next column | 
| str     r1, [sp] | 
|  | 
| @@ -120,6 +112,192 @@ | 
|  | 
| ENDP | 
|  | 
| +; 16x16 version: first-pass six-tap filter whose pld offsets assume width 16. | 
| +; r0 = source bytes, r1 = output shorts, r2 = source line step, r3 = width; | 
| +; stack args (in order): output height, vp8_filter address. | 
| +|vp8_filter_block2d_first_pass_16x16_armv6| PROC | 
| +    stmdb   sp!, {r4 - r11, lr} | 
| + | 
| +    ldr     r11, [sp, #40]                  ; vp8_filter address (2nd stack arg, above the 36 bytes just pushed) | 
| +    ldr     r7, [sp, #36]                   ; output height (1st stack arg) | 
| + | 
| +    add     r4, r2, #18                     ; offset for preloading the next row (r4 is reloaded below) | 
| +    pld     [r0, r4] | 
| + | 
| +    sub     r2, r2, r3                      ; the width loop advances the input pointer by width, | 
| +                                            ; so the height loop only needs to add | 
| +                                            ; (r2 - width) to reach the next input line | 
| + | 
| +    mov     r3, r3, lsl #1                  ; r3 = width * 2 because the output uses shorts | 
| +    add     r12, r3, #16                    ; r12 = width*2 + 16, the strh post-index step (squares off the output) | 
| +    sub     sp, sp, #4 | 
| + | 
| +    ldr     r4, [r11]                       ; r4, r5, r6 = packed filter coefficients (two halfword taps per word) | 
| +    ldr     r5, [r11, #4] | 
| +    ldr     r6, [r11, #8] | 
| + | 
| +    str     r1, [sp]                        ; keep the destination address in the stack slot reserved above | 
| +    mov     r7, r7, lsl #16                 ; height lives in the upper halfword of the counter | 
| + | 
| +; six tap filter: the outer loop runs once per input line | 
| +|height_loop_1st_16_6| | 
| +    ldrb    r8, [r0, #-2]                   ; load the first three source bytes of this line | 
| +    ldrb    r9, [r0, #-1] | 
| +    ldrb    r10, [r0], #2 | 
| +    orr     r7, r7, r3, lsr #2              ; low bits of counter = (width*2)/4 iterations, two outputs each | 
| + | 
| +|width_loop_1st_16_6| | 
| +    ldrb    r11, [r0, #-1] | 
| + | 
| +    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8 | 
| +    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9 | 
| + | 
| +    ldrb    r9, [r0] | 
| + | 
| +    smuad   lr, lr, r4                      ; apply the filter: first coefficient pair, first output | 
| +    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10 | 
| +    smuad   r8, r8, r4 | 
| +    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11 | 
| + | 
| +    smlad   lr, r10, r5, lr | 
| +    ldrb    r10, [r0, #1] | 
| +    smlad   r8, r11, r5, r8 | 
| +    ldrb    r11, [r0, #2] | 
| + | 
| +    sub     r7, r7, #1 | 
| + | 
| +    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9 | 
| +    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10 | 
| + | 
| +    smlad   lr, r9, r6, lr | 
| +    smlad   r11, r10, r6, r8 | 
| + | 
| +    ands    r10, r7, #0xff                  ; test loop counter: Z is set once the low byte reaches zero | 
| + | 
| +    add     lr, lr, #0x40                   ; round_shift_and_clamp: add 0x40, then usat shifts right by 7 and clamps to 0..255 | 
| +    ldrneb  r8, [r0, #-2]                   ; load data for next loop (skipped on the last iteration) | 
| +    usat    lr, #8, lr, asr #7 | 
| +    add     r11, r11, #0x40 | 
| +    ldrneb  r9, [r0, #-1] | 
| +    usat    r11, #8, r11, asr #7 | 
| + | 
| +    strh    lr, [r1], r12                   ; the result is stored transposed, which | 
| +                                            ; makes the second pass filtering easier. | 
| +    ldrneb  r10, [r0], #2 | 
| +    strh    r11, [r1], r12 | 
| + | 
| +    bne     width_loop_1st_16_6 | 
| + | 
| +    ldr     r1, [sp]                        ; reload the saved dst address so it can be updated | 
| +    subs    r7, r7, #0x10000 | 
| +    add     r0, r0, r2                      ; move to next input line | 
| + | 
| +    add     r11, r2, #34                    ; r2 is (line step - width): 34 adds back block width (=16) plus 18 | 
| +    pld     [r0, r11]                       ; preload next row | 
| + | 
| +    add     r1, r1, #2                      ; move over to next output column | 
| +    str     r1, [sp] | 
| + | 
| +    bne     height_loop_1st_16_6 | 
| + | 
| +    add     sp, sp, #4 | 
| +    ldmia   sp!, {r4 - r11, pc} | 
| + | 
| +    ENDP | 
| + | 
| +; 8x8 version: first-pass six-tap filter whose pld offsets assume width 8. | 
| +; r0 = source bytes, r1 = output shorts, r2 = source line step, r3 = width; | 
| +; stack args (in order): output height, vp8_filter address. | 
| +|vp8_filter_block2d_first_pass_8x8_armv6| PROC | 
| +    stmdb   sp!, {r4 - r11, lr} | 
| + | 
| +    ldr     r11, [sp, #40]                  ; vp8_filter address (2nd stack arg, above the 36 bytes just pushed) | 
| +    ldr     r7, [sp, #36]                   ; output height (1st stack arg) | 
| + | 
| +    add     r4, r2, #10                     ; offset for preloading the next row (r4 is reloaded below) | 
| +    pld     [r0, r4] | 
| + | 
| +    sub     r2, r2, r3                      ; the width loop advances the input pointer by width, | 
| +                                            ; so the height loop only needs to add | 
| +                                            ; (r2 - width) to reach the next input line | 
| + | 
| +    mov     r3, r3, lsl #1                  ; r3 = width * 2 because the output uses shorts | 
| +    add     r12, r3, #16                    ; r12 = width*2 + 16, the strh post-index step (squares off the output) | 
| +    sub     sp, sp, #4 | 
| + | 
| +    ldr     r4, [r11]                       ; r4, r5, r6 = packed filter coefficients (two halfword taps per word) | 
| +    ldr     r5, [r11, #4] | 
| +    ldr     r6, [r11, #8] | 
| + | 
| +    str     r1, [sp]                        ; keep the destination address in the stack slot reserved above | 
| +    mov     r7, r7, lsl #16                 ; height lives in the upper halfword of the counter | 
| + | 
| +; six tap filter: the outer loop runs once per input line | 
| +|height_loop_1st_8_6| | 
| +    ldrb    r8, [r0, #-2]                   ; load the first three source bytes of this line | 
| +    ldrb    r9, [r0, #-1] | 
| +    ldrb    r10, [r0], #2 | 
| +    orr     r7, r7, r3, lsr #2              ; low bits of counter = (width*2)/4 iterations, two outputs each | 
| + | 
| +|width_loop_1st_8_6| | 
| +    ldrb    r11, [r0, #-1] | 
| + | 
| +    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8 | 
| +    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9 | 
| + | 
| +    ldrb    r9, [r0] | 
| + | 
| +    smuad   lr, lr, r4                      ; apply the filter: first coefficient pair, first output | 
| +    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10 | 
| +    smuad   r8, r8, r4 | 
| +    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11 | 
| + | 
| +    smlad   lr, r10, r5, lr | 
| +    ldrb    r10, [r0, #1] | 
| +    smlad   r8, r11, r5, r8 | 
| +    ldrb    r11, [r0, #2] | 
| + | 
| +    sub     r7, r7, #1 | 
| + | 
| +    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9 | 
| +    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10 | 
| + | 
| +    smlad   lr, r9, r6, lr | 
| +    smlad   r11, r10, r6, r8 | 
| + | 
| +    ands    r10, r7, #0xff                  ; test loop counter: Z is set once the low byte reaches zero | 
| + | 
| +    add     lr, lr, #0x40                   ; round_shift_and_clamp: add 0x40, then usat shifts right by 7 and clamps to 0..255 | 
| +    ldrneb  r8, [r0, #-2]                   ; load data for next loop (skipped on the last iteration) | 
| +    usat    lr, #8, lr, asr #7 | 
| +    add     r11, r11, #0x40 | 
| +    ldrneb  r9, [r0, #-1] | 
| +    usat    r11, #8, r11, asr #7 | 
| + | 
| +    strh    lr, [r1], r12                   ; the result is stored transposed, which | 
| +                                            ; makes the second pass filtering easier. | 
| +    ldrneb  r10, [r0], #2 | 
| +    strh    r11, [r1], r12 | 
| + | 
| +    bne     width_loop_1st_8_6 | 
| + | 
| +    ldr     r1, [sp]                        ; reload the saved dst address so it can be updated | 
| +    subs    r7, r7, #0x10000 | 
| +    add     r0, r0, r2                      ; move to next input line | 
| + | 
| +    add     r11, r2, #18                    ; r2 is (line step - width): 18 adds back block width (=8) plus 10 | 
| +    pld     [r0, r11]                       ; preload next row | 
| + | 
| +    add     r1, r1, #2                      ; move over to next output column | 
| +    str     r1, [sp] | 
| + | 
| +    bne     height_loop_1st_8_6 | 
| + | 
| +    add     sp, sp, #4 | 
| +    ldmia   sp!, {r4 - r11, pc} | 
| + | 
| +    ENDP | 
| + | 
| ;--------------------------------- | 
| ; r0    short         *src_ptr, | 
| ; r1    unsigned char *output_ptr, | 
| @@ -262,6 +440,10 @@ | 
| |vp8_filter_block2d_first_pass_only_armv6| PROC | 
| stmdb   sp!, {r4 - r11, lr} | 
|  | 
| +    add     r7, r2, r3                      ; preload next low | 
| +    add     r7, r7, #2 | 
| +    pld     [r0, r7] | 
| + | 
| ldr     r4, [sp, #36]                   ; output pitch | 
| ldr     r11, [sp, #40]                  ; HFilter address | 
| sub     sp, sp, #8 | 
| @@ -330,16 +512,15 @@ | 
|  | 
| bne     width_loop_1st_only_6 | 
|  | 
| -    ;;add       r9, r2, #30                 ; attempt to load 2 adjacent cache lines | 
| -    ;;IF ARCHITECTURE=6 | 
| -    ;pld        [r0, r2] | 
| -    ;;pld       [r0, r9] | 
| -    ;;ENDIF | 
| - | 
| ldr     lr, [sp]                        ; load back output pitch | 
| ldr     r12, [sp, #4]                   ; load back output pitch | 
| subs    r7, r7, #1 | 
| add     r0, r0, r12                     ; updata src for next loop | 
| + | 
| +    add     r11, r12, r3                    ; preload next low | 
| +    add     r11, r11, #2 | 
| +    pld     [r0, r11] | 
| + | 
| add     r1, r1, lr                      ; update dst for next loop | 
|  | 
| bne     height_loop_1st_only_6 | 
|  |