| OLD | NEW | 
|---|
| 1 ; | 1 ; | 
| 2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ;  Copyright (c) 2011 The WebM project authors. All Rights Reserved. | 
| 3 ; | 3 ; | 
| 4 ;  Use of this source code is governed by a BSD-style license | 4 ;  Use of this source code is governed by a BSD-style license | 
| 5 ;  that can be found in the LICENSE file in the root of the source | 5 ;  that can be found in the LICENSE file in the root of the source | 
| 6 ;  tree. An additional intellectual property rights grant can be found | 6 ;  tree. An additional intellectual property rights grant can be found | 
| 7 ;  in the file PATENTS.  All contributing project authors may | 7 ;  in the file PATENTS.  All contributing project authors may | 
| 8 ;  be found in the AUTHORS file in the root of the source tree. | 8 ;  be found in the AUTHORS file in the root of the source tree. | 
| 9 ; | 9 ; | 
| 10 | 10 | 
| 11 | 11 | 
| 12     EXPORT  |vp8_fast_quantize_b_neon_func| | 12     EXPORT  |vp8_fast_quantize_b_neon| | 
|  | 13     EXPORT  |vp8_fast_quantize_b_pair_neon| | 
|  | 14 | 
|  | 15     INCLUDE asm_enc_offsets.asm | 
| 13 | 16 | 
| 14     ARM | 17     ARM | 
| 15     REQUIRE8 | 18     REQUIRE8 | 
| 16     PRESERVE8 | 19     PRESERVE8 | 
| 17 | 20 | 
| 18     AREA ||.text||, CODE, READONLY, ALIGN=2 | 21     AREA ||.text||, CODE, READONLY, ALIGN=4 | 
| 19 | 22 | 
| 20 ; r0        short *coeff_ptr | 23 ;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2); quantizes b1 into d1 and b2 into d2 (16 coeffs each), interleaving the two streams | 
| 21 ; r1        short *zbin_ptr | 24 |vp8_fast_quantize_b_pair_neon| PROC | 
| 22 ; r2        short *qcoeff_ptr | 25 | 
| 23 ; r3        short *dqcoeff_ptr | 26     stmfd           sp!, {r4-r9}        ; save r4-r9 (restored before return) | 
| 24 ; stack     short *dequant_ptr | 27     vstmdb          sp!, {q4-q7}        ; save q4-q7 (restored before return) | 
| 25 ; stack     short *scan_mask | 28 | 
| 26 ; stack     short *round_ptr | 29     ldr             r4, [r0, #vp8_block_coeff] | 
| 27 ; stack     short *quant_ptr | 30     ldr             r5, [r0, #vp8_block_quant_fast] | 
| 28 | 31     ldr             r6, [r0, #vp8_block_round] | 
| 29 ; return    int * eob | 32 | 
| 30 |vp8_fast_quantize_b_neon_func| PROC | 33     vld1.16         {q0, q1}, [r4@128]  ; load z (16 coeffs of b1) | 
| 31     vld1.16         {q0, q1}, [r0]              ;load z | 34 | 
| 32     vld1.16         {q10, q11}, [r1]            ;load zbin | 35     ldr             r7, [r2, #vp8_blockd_qcoeff] ; r7 = d1 qcoeff pointer | 
| 33 | 36 | 
| 34     vabs.s16        q4, q0                      ;calculate x = abs(z) | 37     vabs.s16        q4, q0              ; calculate x = abs(z) | 
| 35     vabs.s16        q5, q1 | 38     vabs.s16        q5, q1 | 
| 36 | 39 | 
| 37     vcge.s16        q10, q4, q10                ;x>=zbin |  | 
| 38     vcge.s16        q11, q5, q11 |  | 
| 39 |  | 
| 40     ;if x<zbin (q10 & q11 are all 0), go to zero_output |  | 
| 41     vorr.s16        q6, q10, q11 |  | 
| 42     vorr.s16        d12, d12, d13 |  | 
| 43     vmov            r0, r1, d12 |  | 
| 44     orr             r0, r0, r1 |  | 
| 45     cmp             r0, #0 |  | 
| 46     beq             zero_output |  | 
| 47 |  | 
| 48     ldr             r0, [sp, #8]                ;load round_ptr |  | 
| 49     ldr             r12, [sp, #12]              ;load quant_ptr |  | 
| 50 |  | 
| 51     ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negativ
     e | 40     ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negativ
     e | 
| 52     vshr.s16        q2, q0, #15                 ; sz | 41     vshr.s16        q2, q0, #15         ; sz | 
| 53     vshr.s16        q3, q1, #15 | 42     vshr.s16        q3, q1, #15 | 
| 54 | 43 | 
| 55     vld1.s16        {q6, q7}, [r0]              ;load round_ptr [0-15] | 44     vld1.s16        {q6, q7}, [r6@128]  ; load round_ptr [0-15] (read from b1; the same vectors are applied to both blocks) | 
| 56     vld1.s16        {q8, q9}, [r12]             ;load quant_ptr [0-15] | 45     vld1.s16        {q8, q9}, [r5@128]  ; load quant_ptr [0-15] (read from b1; the same vectors are applied to both blocks) | 
| 57 | 46 | 
| 58     vadd.s16        q4, q6                      ;x + Round | 47     ldr             r4, [r1, #vp8_block_coeff] ; r4 = b2 coeff pointer | 
|  | 48 | 
|  | 49     vadd.s16        q4, q6              ; x + Round | 
| 59     vadd.s16        q5, q7 | 50     vadd.s16        q5, q7 | 
| 60 | 51 | 
| 61     ldr             r0, [sp, #4]                ;load rvsplus1_scan_order ptr | 52     vld1.16         {q0, q1}, [r4@128]  ; load z2 (16 coeffs of b2) | 
| 62 | 53 | 
| 63     vqdmulh.s16     q4, q8                      ;y = ((Round + abs(z)) * Quant) 
     >> 16 | 54     vqdmulh.s16     q4, q8              ; y = ((Round+abs(z)) * Quant) >> 16 | 
| 64     vqdmulh.s16     q5, q9 | 55     vqdmulh.s16     q5, q9 | 
| 65 | 56 | 
| 66     vld1.16         {q0, q1}, [r0]              ;load rvsplus1_scan_order | 57     vabs.s16        q10, q0             ; calculate x2 = abs(z2) | 
| 67     vceq.s16        q8, q8                      ;set q8 to all 1 | 58     vabs.s16        q11, q1 | 
| 68 | 59     vshr.s16        q12, q0, #15        ; sz2 | 
| 69     vshr.s16        q4, #1                      ;right shift 1 after vqdmulh | 60     vshr.s16        q13, q1, #15 | 
|  | 61 | 
|  | 62     ;modify data to have its original sign | 
|  | 63     veor.s16        q4, q2              ; y^sz | 
|  | 64     veor.s16        q5, q3 | 
|  | 65 | 
|  | 66     vadd.s16        q10, q6             ; x2 + Round | 
|  | 67     vadd.s16        q11, q7 | 
|  | 68 | 
|  | 69     ldr             r8, [r2, #vp8_blockd_dequant] | 
|  | 70 | 
|  | 71     vqdmulh.s16     q10, q8             ; y2 = ((Round+abs(z2)) * Quant) >> 16 | 
|  | 72     vqdmulh.s16     q11, q9 | 
|  | 73 | 
|  | 74     vshr.s16        q4, #1              ; right shift 1 after vqdmulh | 
| 70     vshr.s16        q5, #1 | 75     vshr.s16        q5, #1 | 
| 71 | 76 | 
|  | 77     vld1.s16        {q6, q7}, [r8@128]  ; load dequant_ptr [0-15] (read from d1; reused for both blocks, replaces Round in q6/q7) | 
|  | 78 | 
|  | 79     vsub.s16        q4, q2              ; x1=(y^sz)-sz = (y^sz)-(-1) (2's comple
     ment) | 
|  | 80     vsub.s16        q5, q3 | 
|  | 81 | 
|  | 82     vshr.s16        q10, #1             ; right shift 1 after vqdmulh | 
|  | 83     vshr.s16        q11, #1 | 
|  | 84 | 
|  | 85     ldr             r9, [r2, #vp8_blockd_dqcoeff] | 
|  | 86 | 
|  | 87     veor.s16        q10, q12            ; y2^sz2 | 
|  | 88     veor.s16        q11, q13 | 
|  | 89 | 
|  | 90     vst1.s16        {q4, q5}, [r7]      ; store: d1 qcoeff = x1 | 
|  | 91 | 
|  | 92 | 
|  | 93     vsub.s16        q10, q12            ; x2=(y2^sz2)-sz2 = (y2^sz2)-(-1) (2's comple
     ment) | 
|  | 94     vsub.s16        q11, q13 | 
|  | 95 | 
|  | 96     ldr             r6, [r3, #vp8_blockd_qcoeff] | 
|  | 97 | 
|  | 98     vmul.s16        q2, q6, q4          ; x1 * Dequant | 
|  | 99     vmul.s16        q3, q7, q5 | 
|  | 100 | 
|  | 101     ldr             r0, _inv_zig_zag_   ; load ptr of inverse zigzag table | 
|  | 102 | 
|  | 103     vceq.s16        q8, q8              ; set q8 to all 1 | 
|  | 104 | 
|  | 105     vst1.s16        {q10, q11}, [r6]    ; store: d2 qcoeff = x2 | 
|  | 106 | 
|  | 107     vmul.s16        q12, q6, q10        ; x2 * Dequant | 
|  | 108     vmul.s16        q13, q7, q11 | 
|  | 109 | 
|  | 110     vld1.16         {q6, q7}, [r0@128]  ; load inverse scan order (1-based positions) | 
|  | 111 | 
|  | 112     vtst.16         q14, q4, q8         ; now find eob for block 1 | 
|  | 113     vtst.16         q15, q5, q8         ; non-zero element is set to all 1 | 
|  | 114 | 
|  | 115     vst1.s16        {q2, q3}, [r9]      ; store d1 dqcoeff = x1 * Dequant | 
|  | 116 | 
|  | 117     ldr             r7, [r3, #vp8_blockd_dqcoeff] | 
|  | 118 | 
|  | 119     vand            q0, q6, q14         ; get all valid numbers from scan array | 
|  | 120     vand            q1, q7, q15 | 
|  | 121 | 
|  | 122     vst1.s16        {q12, q13}, [r7]    ; store d2 dqcoeff = x2 * Dequant | 
|  | 123 | 
|  | 124     vtst.16         q2, q10, q8         ; now find eob for block 2 | 
|  | 125     vtst.16         q3, q11, q8         ; non-zero element is set to all 1 | 
|  | 126 | 
|  | 127     vmax.u16        q0, q0, q1          ; find maximum value in q0, q1 | 
|  | 128 | 
|  | 129     vand            q10, q6, q2         ; get all valid numbers from scan array | 
|  | 130     vand            q11, q7, q3 | 
|  | 131     vmax.u16        q10, q10, q11       ; find maximum value in q10, q11 | 
|  | 132 | 
|  | 133     vmax.u16        d0, d0, d1 | 
|  | 134     vmax.u16        d20, d20, d21 | 
|  | 135     vmovl.u16       q0, d0 | 
|  | 136     vmovl.u16       q10, d20 | 
|  | 137 | 
|  | 138 | 
|  | 139     vmax.u32        d0, d0, d1 | 
|  | 140     vmax.u32        d20, d20, d21 | 
|  | 141     vpmax.u32       d0, d0, d0 | 
|  | 142     vpmax.u32       d20, d20, d20 | 
|  | 143 | 
|  | 144     add             r4, r2, #vp8_blockd_eob | 
|  | 145     add             r5, r3, #vp8_blockd_eob | 
|  | 146 | 
|  | 147     vst1.32         {d0[0]}, [r4@32] | 
|  | 148     vst1.32         {d20[0]}, [r5@32] | 
|  | 149 | 
|  | 150     vldmia          sp!, {q4-q7} | 
|  | 151     ldmfd           sp!, {r4-r9} | 
|  | 152     bx              lr | 
|  | 153 | 
|  | 154     ENDP | 
|  | 155 | 
|  | 156 ;void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d)  (this comment previously named vp8_fast_quantize_b_c; presumably the C routine with the same prototype) | 
|  | 157 |vp8_fast_quantize_b_neon| PROC | 
|  | 158 | 
|  | 159     stmfd           sp!, {r4-r7} | 
|  | 160 | 
|  | 161     ldr             r3, [r0, #vp8_block_coeff] | 
|  | 162     ldr             r4, [r0, #vp8_block_quant_fast] | 
|  | 163     ldr             r5, [r0, #vp8_block_round] | 
|  | 164 | 
|  | 165     vld1.16         {q0, q1}, [r3@128]  ; load z | 
|  | 166     vorr.s16        q14, q0, q1         ; check if all zero (step 1) | 
|  | 167     ldr             r6, [r1, #vp8_blockd_qcoeff] | 
|  | 168     ldr             r7, [r1, #vp8_blockd_dqcoeff] | 
|  | 169     vorr.s16        d28, d28, d29       ; check if all zero (step 2) | 
|  | 170 | 
|  | 171     vabs.s16        q12, q0             ; calculate x = abs(z) | 
|  | 172     vabs.s16        q13, q1 | 
|  | 173 | 
|  | 174     ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negativ
     e | 
|  | 175     vshr.s16        q2, q0, #15         ; sz | 
|  | 176     vmov            r2, r3, d28         ; check if all zero (step 3): r2:r3 = OR of all 16 coeffs | 
|  | 177     vshr.s16        q3, q1, #15 | 
|  | 178 | 
|  | 179     vld1.s16        {q14, q15}, [r5@128]; load round_ptr [0-15] | 
|  | 180     vld1.s16        {q8, q9}, [r4@128]  ; load quant_ptr [0-15] | 
|  | 181 | 
|  | 182     vadd.s16        q12, q14            ; x + Round | 
|  | 183     vadd.s16        q13, q15 | 
|  | 184 | 
|  | 185     ldr             r0, _inv_zig_zag_   ; load ptr of inverse zigzag table | 
|  | 186 | 
|  | 187     vqdmulh.s16     q12, q8             ; y = ((Round+abs(z)) * Quant) >> 16 | 
|  | 188     vqdmulh.s16     q13, q9 | 
|  | 189 | 
|  | 190     vld1.16         {q10, q11}, [r0@128]; load inverse scan order | 
|  | 191 | 
|  | 192     vceq.s16        q8, q8              ; set q8 to all 1 | 
|  | 193 | 
|  | 194     ldr             r4, [r1, #vp8_blockd_dequant] | 
|  | 195 | 
|  | 196     vshr.s16        q12, #1             ; right shift 1 after vqdmulh | 
|  | 197     vshr.s16        q13, #1 | 
|  | 198 | 
|  | 199     orr             r2, r2, r3          ; check if all zero (step 4) | 
|  | 200     cmp             r2, #0              ; check if all zero (step 5) | 
|  | 201     beq             zero_output         ; check if all zero (step 6): every input coeff is 0 | 
|  | 202 | 
| 72     ;modify data to have its original sign | 203     ;modify data to have its original sign | 
| 73     veor.s16        q4, q2                      ; y^sz | 204     veor.s16        q12, q2             ; y^sz | 
| 74     veor.s16        q5, q3 | 205     veor.s16        q13, q3 | 
| 75 | 206 | 
| 76     ldr             r12, [sp]                   ;load dequant_ptr | 207     vsub.s16        q12, q2             ; x1=(y^sz)-sz = (y^sz)-(-1) (2's comple
     ment) | 
| 77 | 208     vsub.s16        q13, q3 | 
| 78     vsub.s16        q4, q2                      ; x1 = (y^sz) - sz = (y^sz) - (-
     1) (two's complement) | 209 | 
| 79     vsub.s16        q5, q3 | 210     vld1.s16        {q2, q3}, [r4@128]  ; load dequant_ptr [0-15] (overwrites sz in q2/q3, which is no longer needed) | 
| 80 | 211 | 
| 81     vand.s16        q4, q10                     ;mask off x1 elements | 212     vtst.16         q14, q12, q8        ; now find eob | 
| 82     vand.s16        q5, q11 | 213     vtst.16         q15, q13, q8        ; non-zero element is set to all 1 | 
| 83 | 214 | 
| 84     vld1.s16        {q6, q7}, [r12]             ;load dequant_ptr[i] | 215     vst1.s16        {q12, q13}, [r6@128]; store: qcoeff = x1 | 
| 85 | 216 | 
| 86     vtst.16         q14, q4, q8                 ;now find eob | 217     vand            q10, q10, q14       ; get all valid numbers from scan array | 
| 87     vtst.16         q15, q5, q8                 ;non-zero element is set to all 
     1 in q4, q5 | 218     vand            q11, q11, q15 | 
| 88 | 219 | 
| 89     vst1.s16        {q4, q5}, [r2]              ;store: qcoeff = x1 | 220 | 
| 90 | 221     vmax.u16        q0, q10, q11        ; find maximum value in q10, q11 (result in q0) | 
| 91     vand            q0, q0, q14                 ;get all valid number from rvspl
     us1_scan_order array |  | 
| 92     vand            q1, q1, q15 |  | 
| 93 |  | 
| 94     vmax.u16        q0, q0, q1                  ;find maximum value in q0, q1 |  | 
| 95     vmax.u16        d0, d0, d1 | 222     vmax.u16        d0, d0, d1 | 
| 96     vmovl.u16       q0, d0 | 223     vmovl.u16       q0, d0 | 
| 97 | 224 | 
| 98     vmul.s16        q6, q4                      ;x * Dequant | 225     vmul.s16        q2, q12             ; x1 * Dequant | 
| 99     vmul.s16        q7, q5 | 226     vmul.s16        q3, q13 | 
| 100 | 227 | 
| 101     vmax.u32        d0, d0, d1 | 228     vmax.u32        d0, d0, d1 | 
| 102     vpmax.u32       d0, d0, d0 | 229     vpmax.u32       d0, d0, d0 | 
| 103 | 230 | 
| 104     vst1.s16        {q6, q7}, [r3]              ;store dqcoeff = x * Dequant | 231     vst1.s16        {q2, q3}, [r7@128]  ; store dqcoeff = x1 * Dequant | 
| 105 | 232 | 
| 106     vmov.32         r0, d0[0] | 233     add             r4, r1, #vp8_blockd_eob | 
|  | 234     vst1.32         {d0[0]}, [r4@32] | 
|  | 235 | 
|  | 236     ldmfd           sp!, {r4-r7} | 
| 107     bx              lr | 237     bx              lr | 
| 108 | 238 | 
| 109 zero_output | 239 zero_output | 
| 110     vst1.s16        {q10, q11}, [r2]        ; qcoeff = 0 | 240     str             r2, [r1, #vp8_blockd_eob] ; eob = 0 (r2 is 0 on this path) | 
| 111     vst1.s16        {q10, q11}, [r3]        ; dqcoeff = 0 | 241     vst1.s16        {q0, q1}, [r6@128]  ; qcoeff = 0 (q0/q1 still hold z, which is all zero here) | 
| 112     mov             r0, #0 | 242     vst1.s16        {q0, q1}, [r7@128]  ; dqcoeff = 0 | 
| 113 | 243 | 
|  | 244     ldmfd           sp!, {r4-r7} | 
| 114     bx              lr | 245     bx              lr | 
| 115 | 246 | 
| 116     ENDP | 247     ENDP | 
| 117 | 248 | 
|  | 249 ; default inverse zigzag table is defined in vp8/common/entropy.c; the copy below holds 1-based positions (1..16), so the maximum over non-zero coefficients is the eob value stored by the routines above | 
|  | 250 _inv_zig_zag_ | 
|  | 251     DCD inv_zig_zag | 
|  | 252 | 
|  | 253     ALIGN 16    ; enable use of @128 bit aligned loads of the table that follows | 
|  | 254 inv_zig_zag | 
|  | 255     DCW 0x0001, 0x0002, 0x0006, 0x0007 | 
|  | 256     DCW 0x0003, 0x0005, 0x0008, 0x000d | 
|  | 257     DCW 0x0004, 0x0009, 0x000c, 0x000e | 
|  | 258     DCW 0x000a, 0x000b, 0x000f, 0x0010 | 
|  | 259 | 
| 118     END | 260     END | 
|  | 261 | 
| OLD | NEW | 
|---|