source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm - Issue 7671004: Update libvpx snapshot to v0.9.7-p1 (Cayuga).

Side by Side Diff: source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm

Issue 7671004: Update libvpx snapshot to v0.9.7-p1 (Cayuga). (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/

Patch Set: '' Created 9 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« no previous file with comments | « source/libvpx/vp8/encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm ('k') | source/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 ;	1 ;

2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.	2 ; Copyright (c) 2011 The WebM project authors. All Rights Reserved.

3 ;	3 ;

4 ; Use of this source code is governed by a BSD-style license	4 ; Use of this source code is governed by a BSD-style license

5 ; that can be found in the LICENSE file in the root of the source	5 ; that can be found in the LICENSE file in the root of the source

6 ; tree. An additional intellectual property rights grant can be found	6 ; tree. An additional intellectual property rights grant can be found

7 ; in the file PATENTS. All contributing project authors may	7 ; in the file PATENTS. All contributing project authors may

8 ; be found in the AUTHORS file in the root of the source tree.	8 ; be found in the AUTHORS file in the root of the source tree.

9 ;	9 ;

10	10

11	11

12 EXPORT \|vp8_fast_quantize_b_neon_func\|	12 EXPORT \|vp8_fast_quantize_b_neon\|

	13 EXPORT \|vp8_fast_quantize_b_pair_neon\|

	14

	15 INCLUDE asm_enc_offsets.asm

13	16

14 ARM	17 ARM

15 REQUIRE8	18 REQUIRE8

16 PRESERVE8	19 PRESERVE8

17	20

18 AREA \|\|.text\|\|, CODE, READONLY, ALIGN=2	21 AREA \|\|.text\|\|, CODE, READONLY, ALIGN=4

19	22

20 ; r0 short *coeff_ptr	23 ;vp8_fast_quantize_b_pair_neon(BLOCK b1, BLOCK b2, BLOCKD d1, BLOCKD d2);

21 ; r1 short *zbin_ptr	24 \|vp8_fast_quantize_b_pair_neon\| PROC

22 ; r2 short *qcoeff_ptr	25

23 ; r3 short *dqcoeff_ptr	26 stmfd sp!, {r4-r9}

24 ; stack short *dequant_ptr	27 vstmdb sp!, {q4-q7}

25 ; stack short *scan_mask	28

26 ; stack short *round_ptr	29 ldr r4, [r0, #vp8_block_coeff]

27 ; stack short *quant_ptr	30 ldr r5, [r0, #vp8_block_quant_fast]

28	31 ldr r6, [r0, #vp8_block_round]

29 ; return int * eob	32

30 \|vp8_fast_quantize_b_neon_func\| PROC	33 vld1.16 {q0, q1}, [r4@128] ; load z

31 vld1.16 {q0, q1}, [r0] ;load z	34

32 vld1.16 {q10, q11}, [r1] ;load zbin	35 ldr r7, [r2, #vp8_blockd_qcoeff]

33	36

34 vabs.s16 q4, q0 ;calculate x = abs(z)	37 vabs.s16 q4, q0 ; calculate x = abs(z)

35 vabs.s16 q5, q1	38 vabs.s16 q5, q1

36	39

37 vcge.s16 q10, q4, q10 ;x>=zbin

38 vcge.s16 q11, q5, q11

39

40 ;if x<zbin (q10 & q11 are all 0), go to zero_output

41 vorr.s16 q6, q10, q11

42 vorr.s16 d12, d12, d13

43 vmov r0, r1, d12

44 orr r0, r0, r1

45 cmp r0, #0

46 beq zero_output

47

48 ldr r0, [sp, #8] ;load round_ptr

49 ldr r12, [sp, #12] ;load quant_ptr

50

51 ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative	40 ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative

52 vshr.s16 q2, q0, #15 ; sz	41 vshr.s16 q2, q0, #15 ; sz

53 vshr.s16 q3, q1, #15	42 vshr.s16 q3, q1, #15

54	43

55 vld1.s16 {q6, q7}, [r0] ;load round_ptr [0-15]	44 vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15]

56 vld1.s16 {q8, q9}, [r12] ;load quant_ptr [0-15]	45 vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15]

57	46

58 vadd.s16 q4, q6 ;x + Round	47 ldr r4, [r1, #vp8_block_coeff]

	48

	49 vadd.s16 q4, q6 ; x + Round

59 vadd.s16 q5, q7	50 vadd.s16 q5, q7

60	51

61 ldr r0, [sp, #4] ;load rvsplus1_scan_order ptr	52 vld1.16 {q0, q1}, [r4@128] ; load z2

62	53

63 vqdmulh.s16 q4, q8 ;y = ((Round + abs(z)) * Quant) >> 16	54 vqdmulh.s16 q4, q8 ; y = ((Round+abs(z)) * Quant) >> 16

64 vqdmulh.s16 q5, q9	55 vqdmulh.s16 q5, q9

65	56

66 vld1.16 {q0, q1}, [r0] ;load rvsplus1_scan_order	57 vabs.s16 q10, q0 ; calculate x2 = abs(z_2)

67 vceq.s16 q8, q8 ;set q8 to all 1	58 vabs.s16 q11, q1

68	59 vshr.s16 q12, q0, #15 ; sz2

69 vshr.s16 q4, #1 ;right shift 1 after vqdmulh	60 vshr.s16 q13, q1, #15

	61

	62 ;modify data to have its original sign

	63 veor.s16 q4, q2 ; y^sz

	64 veor.s16 q5, q3

	65

	66 vadd.s16 q10, q6 ; x2 + Round

	67 vadd.s16 q11, q7

	68

	69 ldr r8, [r2, #vp8_blockd_dequant]

	70

	71 vqdmulh.s16 q10, q8 ; y2 = ((Round+abs(z2)) * Quant) >> 16

	72 vqdmulh.s16 q11, q9

	73

	74 vshr.s16 q4, #1 ; right shift 1 after vqdmulh

70 vshr.s16 q5, #1	75 vshr.s16 q5, #1

71	76

	77 vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i]

	78

	79 vsub.s16 q4, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)

	80 vsub.s16 q5, q3

	81

	82 vshr.s16 q10, #1 ; right shift 1 after vqdmulh

	83 vshr.s16 q11, #1

	84

	85 ldr r9, [r2, #vp8_blockd_dqcoeff]

	86

	87 veor.s16 q10, q12 ; y2^sz2

	88 veor.s16 q11, q13

	89

	90 vst1.s16 {q4, q5}, [r7] ; store: qcoeff = x1

	91

	92

	93 vsub.s16 q10, q12 ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement)

	94 vsub.s16 q11, q13

	95

	96 ldr r6, [r3, #vp8_blockd_qcoeff]

	97

	98 vmul.s16 q2, q6, q4 ; x * Dequant

	99 vmul.s16 q3, q7, q5

	100

	101 ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table

	102

	103 vceq.s16 q8, q8 ; set q8 to all 1

	104

	105 vst1.s16 {q10, q11}, [r6] ; store: qcoeff = x2

	106

	107 vmul.s16 q12, q6, q10 ; x2 * Dequant

	108 vmul.s16 q13, q7, q11

	109

	110 vld1.16 {q6, q7}, [r0@128] ; load inverse scan order

	111

	112 vtst.16 q14, q4, q8 ; now find eob

	113 vtst.16 q15, q5, q8 ; non-zero element is set to all 1

	114

	115 vst1.s16 {q2, q3}, [r9] ; store dqcoeff = x * Dequant

	116

	117 ldr r7, [r3, #vp8_blockd_dqcoeff]

	118

	119 vand q0, q6, q14 ; get all valid numbers from scan array

	120 vand q1, q7, q15

	121

	122 vst1.s16 {q12, q13}, [r7] ; store dqcoeff = x2 * Dequant

	123

	124 vtst.16 q2, q10, q8 ; now find eob

	125 vtst.16 q3, q11, q8 ; non-zero element is set to all 1

	126

	127 vmax.u16 q0, q0, q1 ; find maximum value in q0, q1

	128

	129 vand q10, q6, q2 ; get all valid numbers from scan array

	130 vand q11, q7, q3

	131 vmax.u16 q10, q10, q11 ; find maximum value in q10, q11

	132

	133 vmax.u16 d0, d0, d1

	134 vmax.u16 d20, d20, d21

	135 vmovl.u16 q0, d0

	136 vmovl.u16 q10, d20

	137

	138

	139 vmax.u32 d0, d0, d1

	140 vmax.u32 d20, d20, d21

	141 vpmax.u32 d0, d0, d0

	142 vpmax.u32 d20, d20, d20

	143

	144 add r4, r2, #vp8_blockd_eob

	145 add r5, r3, #vp8_blockd_eob

	146

	147 vst1.32 {d0[0]}, [r4@32]

	148 vst1.32 {d20[0]}, [r5@32]

	149

	150 vldmia sp!, {q4-q7}

	151 ldmfd sp!, {r4-r9}

	152 bx lr

	153

	154 ENDP

	155

	156 ;void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d)

	157 \|vp8_fast_quantize_b_neon\| PROC

	158

	159 stmfd sp!, {r4-r7}

	160

	161 ldr r3, [r0, #vp8_block_coeff]

	162 ldr r4, [r0, #vp8_block_quant_fast]

	163 ldr r5, [r0, #vp8_block_round]

	164

	165 vld1.16 {q0, q1}, [r3@128] ; load z

	166 vorr.s16 q14, q0, q1 ; check if all zero (step 1)

	167 ldr r6, [r1, #vp8_blockd_qcoeff]

	168 ldr r7, [r1, #vp8_blockd_dqcoeff]

	169 vorr.s16 d28, d28, d29 ; check if all zero (step 2)

	170

	171 vabs.s16 q12, q0 ; calculate x = abs(z)

	172 vabs.s16 q13, q1

	173

	174 ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative

	175 vshr.s16 q2, q0, #15 ; sz

	176 vmov r2, r3, d28 ; check if all zero (step 3)

	177 vshr.s16 q3, q1, #15

	178

	179 vld1.s16 {q14, q15}, [r5@128]; load round_ptr [0-15]

	180 vld1.s16 {q8, q9}, [r4@128] ; load quant_ptr [0-15]

	181

	182 vadd.s16 q12, q14 ; x + Round

	183 vadd.s16 q13, q15

	184

	185 ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table

	186

	187 vqdmulh.s16 q12, q8 ; y = ((Round+abs(z)) * Quant) >> 16

	188 vqdmulh.s16 q13, q9

	189

	190 vld1.16 {q10, q11}, [r0@128]; load inverse scan order

	191

	192 vceq.s16 q8, q8 ; set q8 to all 1

	193

	194 ldr r4, [r1, #vp8_blockd_dequant]

	195

	196 vshr.s16 q12, #1 ; right shift 1 after vqdmulh

	197 vshr.s16 q13, #1

	198

	199 orr r2, r2, r3 ; check if all zero (step 4)

	200 cmp r2, #0 ; check if all zero (step 5)

	201 beq zero_output ; check if all zero (step 6)

	202

72 ;modify data to have its original sign	203 ;modify data to have its original sign

73 veor.s16 q4, q2 ; y^sz	204 veor.s16 q12, q2 ; y^sz

74 veor.s16 q5, q3	205 veor.s16 q13, q3

75	206

76 ldr r12, [sp] ;load dequant_ptr	207 vsub.s16 q12, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)

77	208 vsub.s16 q13, q3

78 vsub.s16 q4, q2 ; x1 = (y^sz) - sz = (y^sz) - (- 1) (two's complement)	209

79 vsub.s16 q5, q3	210 vld1.s16 {q2, q3}, [r4@128] ; load dequant_ptr[i]

80	211

81 vand.s16 q4, q10 ;mask off x1 elements	212 vtst.16 q14, q12, q8 ; now find eob

82 vand.s16 q5, q11	213 vtst.16 q15, q13, q8 ; non-zero element is set to all 1

83	214

84 vld1.s16 {q6, q7}, [r12] ;load dequant_ptr[i]	215 vst1.s16 {q12, q13}, [r6@128]; store: qcoeff = x1

85	216

86 vtst.16 q14, q4, q8 ;now find eob	217 vand q10, q10, q14 ; get all valid numbers from scan array

87 vtst.16 q15, q5, q8 ;non-zero element is set to all 1 in q4, q5	218 vand q11, q11, q15

88	219

89 vst1.s16 {q4, q5}, [r2] ;store: qcoeff = x1	220

90	221 vmax.u16 q0, q10, q11 ; find maximum value in q10, q11

91 vand q0, q0, q14 ;get all valid number from rvsplus1_scan_order array

92 vand q1, q1, q15

93

94 vmax.u16 q0, q0, q1 ;find maximum value in q0, q1

95 vmax.u16 d0, d0, d1	222 vmax.u16 d0, d0, d1

96 vmovl.u16 q0, d0	223 vmovl.u16 q0, d0

97	224

98 vmul.s16 q6, q4 ;x * Dequant	225 vmul.s16 q2, q12 ; x * Dequant

99 vmul.s16 q7, q5	226 vmul.s16 q3, q13

100	227

101 vmax.u32 d0, d0, d1	228 vmax.u32 d0, d0, d1

102 vpmax.u32 d0, d0, d0	229 vpmax.u32 d0, d0, d0

103	230

104 vst1.s16 {q6, q7}, [r3] ;store dqcoeff = x * Dequant	231 vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant

105	232

106 vmov.32 r0, d0[0]	233 add r4, r1, #vp8_blockd_eob

	234 vst1.32 {d0[0]}, [r4@32]

	235

	236 ldmfd sp!, {r4-r7}

107 bx lr	237 bx lr

108	238

109 zero_output	239 zero_output

110 vst1.s16 {q10, q11}, [r2] ; qcoeff = 0	240 str r2, [r1, #vp8_blockd_eob]

111 vst1.s16 {q10, q11}, [r3] ; dqcoeff = 0	241 vst1.s16 {q0, q1}, [r6@128] ; qcoeff = 0

112 mov r0, #0	242 vst1.s16 {q0, q1}, [r7@128] ; dqcoeff = 0

113	243

	244 ldmfd sp!, {r4-r7}

114 bx lr	245 bx lr

115	246

116 ENDP	247 ENDP

117	248

	249 ; default inverse zigzag table is defined in vp8/common/entropy.c

	250 _inv_zig_zag_

	251 DCD inv_zig_zag

	252

	253 ALIGN 16 ; enable use of @128 bit aligned loads

	254 inv_zig_zag

	255 DCW 0x0001, 0x0002, 0x0006, 0x0007

	256 DCW 0x0003, 0x0005, 0x0008, 0x000d

	257 DCW 0x0004, 0x0009, 0x000c, 0x000e

	258 DCW 0x000a, 0x000b, 0x000f, 0x0010

	259

118 END	260 END

	261

OLD	NEW