source/libvpx/vpx_dsp/arm/idct16x16_neon.c - Issue 1302353004: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vpx_dsp/arm/idct16x16_neon.c

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master

Patch Set: Created 5 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include "vp9/common/vp9_common.h"	11 #include "vpx_dsp/vpx_dsp_common.h"

12	12

13 void vp9_idct16x16_256_add_neon_pass1(const int16_t *input,	13 void vpx_idct16x16_256_add_neon_pass1(const int16_t *input,

14 int16_t *output,	14 int16_t *output,

15 int output_stride);	15 int output_stride);

16 void vp9_idct16x16_256_add_neon_pass2(const int16_t *src,	16 void vpx_idct16x16_256_add_neon_pass2(const int16_t *src,

17 int16_t *output,	17 int16_t *output,

18 int16_t *pass1Output,	18 int16_t *pass1Output,

19 int16_t skip_adding,	19 int16_t skip_adding,

20 uint8_t *dest,	20 uint8_t *dest,

21 int dest_stride);	21 int dest_stride);

22 void vp9_idct16x16_10_add_neon_pass1(const int16_t *input,	22 void vpx_idct16x16_10_add_neon_pass1(const int16_t *input,

23 int16_t *output,	23 int16_t *output,

24 int output_stride);	24 int output_stride);

25 void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,	25 void vpx_idct16x16_10_add_neon_pass2(const int16_t *src,

26 int16_t *output,	26 int16_t *output,

27 int16_t *pass1Output,	27 int16_t *pass1Output,

28 int16_t skip_adding,	28 int16_t skip_adding,

29 uint8_t *dest,	29 uint8_t *dest,

30 int dest_stride);	30 int dest_stride);

31	31

32 #if HAVE_NEON_ASM	32 #if HAVE_NEON_ASM

33 /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */	33 /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */

34 extern void vp9_push_neon(int64_t *store);	34 extern void vpx_push_neon(int64_t *store);

35 extern void vp9_pop_neon(int64_t *store);	35 extern void vpx_pop_neon(int64_t *store);

36 #endif // HAVE_NEON_ASM	36 #endif // HAVE_NEON_ASM

37	37

38 void vp9_idct16x16_256_add_neon(const int16_t *input,	38 void vpx_idct16x16_256_add_neon(const int16_t *input,

39 uint8_t *dest, int dest_stride) {	39 uint8_t *dest, int dest_stride) {

40 #if HAVE_NEON_ASM	40 #if HAVE_NEON_ASM

41 int64_t store_reg[8];	41 int64_t store_reg[8];

42 #endif	42 #endif

43 int16_t pass1_output[16*16] = {0};	43 int16_t pass1_output[16*16] = {0};

44 int16_t row_idct_output[16*16] = {0};	44 int16_t row_idct_output[16*16] = {0};

45	45

46 #if HAVE_NEON_ASM	46 #if HAVE_NEON_ASM

47 // save d8-d15 register values.	47 // save d8-d15 register values.

48 vp9_push_neon(store_reg);	48 vpx_push_neon(store_reg);

49 #endif	49 #endif

50	50

51 /* Parallel idct on the upper 8 rows */	51 /* Parallel idct on the upper 8 rows */

52 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the	52 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the

53 // stage 6 result in pass1_output.	53 // stage 6 result in pass1_output.

54 vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8);	54 vpx_idct16x16_256_add_neon_pass1(input, pass1_output, 8);

55	55

56 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines	56 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines

57 // with result in pass1(pass1_output) to calculate final result in stage 7	57 // with result in pass1(pass1_output) to calculate final result in stage 7

58 // which will be saved into row_idct_output.	58 // which will be saved into row_idct_output.

59 vp9_idct16x16_256_add_neon_pass2(input+1,	59 vpx_idct16x16_256_add_neon_pass2(input+1,

60 row_idct_output,	60 row_idct_output,

61 pass1_output,	61 pass1_output,

62 0,	62 0,

63 dest,	63 dest,

64 dest_stride);	64 dest_stride);

65	65

66 /* Parallel idct on the lower 8 rows */	66 /* Parallel idct on the lower 8 rows */

67 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the	67 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the

68 // stage 6 result in pass1_output.	68 // stage 6 result in pass1_output.

69 vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);	69 vpx_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);

70	70

71 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines	71 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines

72 // with result in pass1(pass1_output) to calculate final result in stage 7	72 // with result in pass1(pass1_output) to calculate final result in stage 7

73 // which will be saved into row_idct_output.	73 // which will be saved into row_idct_output.

74 vp9_idct16x16_256_add_neon_pass2(input+8*16+1,	74 vpx_idct16x16_256_add_neon_pass2(input+8*16+1,

75 row_idct_output+8,	75 row_idct_output+8,

76 pass1_output,	76 pass1_output,

77 0,	77 0,

78 dest,	78 dest,

79 dest_stride);	79 dest_stride);

80	80

81 /* Parallel idct on the left 8 columns */	81 /* Parallel idct on the left 8 columns */

82 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the	82 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the

83 // stage 6 result in pass1_output.	83 // stage 6 result in pass1_output.

84 vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);	84 vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);

85	85

86 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines	86 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines

87 // with result in pass1(pass1_output) to calculate final result in stage 7.	87 // with result in pass1(pass1_output) to calculate final result in stage 7.

88 // Then add the result to the destination data.	88 // Then add the result to the destination data.

89 vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,	89 vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,

90 row_idct_output,	90 row_idct_output,

91 pass1_output,	91 pass1_output,

92 1,	92 1,

93 dest,	93 dest,

94 dest_stride);	94 dest_stride);

95	95

96 /* Parallel idct on the right 8 columns */	96 /* Parallel idct on the right 8 columns */

97 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the	97 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the

98 // stage 6 result in pass1_output.	98 // stage 6 result in pass1_output.

99 vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);	99 vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);

100	100

101 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines	101 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines

102 // with result in pass1(pass1_output) to calculate final result in stage 7.	102 // with result in pass1(pass1_output) to calculate final result in stage 7.

103 // Then add the result to the destination data.	103 // Then add the result to the destination data.

104 vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,	104 vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,

105 row_idct_output+8,	105 row_idct_output+8,

106 pass1_output,	106 pass1_output,

107 1,	107 1,

108 dest+8,	108 dest+8,

109 dest_stride);	109 dest_stride);

110	110

111 #if HAVE_NEON_ASM	111 #if HAVE_NEON_ASM

112 // restore d8-d15 register values.	112 // restore d8-d15 register values.

113 vp9_pop_neon(store_reg);	113 vpx_pop_neon(store_reg);

114 #endif	114 #endif

115	115

116 return;	116 return;

117 }	117 }

118	118

119 void vp9_idct16x16_10_add_neon(const int16_t *input,	119 void vpx_idct16x16_10_add_neon(const int16_t *input,

120 uint8_t *dest, int dest_stride) {	120 uint8_t *dest, int dest_stride) {

121 #if HAVE_NEON_ASM	121 #if HAVE_NEON_ASM

122 int64_t store_reg[8];	122 int64_t store_reg[8];

123 #endif	123 #endif

124 int16_t pass1_output[16*16] = {0};	124 int16_t pass1_output[16*16] = {0};

125 int16_t row_idct_output[16*16] = {0};	125 int16_t row_idct_output[16*16] = {0};

126	126

127 #if HAVE_NEON_ASM	127 #if HAVE_NEON_ASM

128 // save d8-d15 register values.	128 // save d8-d15 register values.

129 vp9_push_neon(store_reg);	129 vpx_push_neon(store_reg);

130 #endif	130 #endif

131	131

132 /* Parallel idct on the upper 8 rows */	132 /* Parallel idct on the upper 8 rows */

133 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the	133 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the

134 // stage 6 result in pass1_output.	134 // stage 6 result in pass1_output.

135 vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8);	135 vpx_idct16x16_10_add_neon_pass1(input, pass1_output, 8);

136	136

137 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines	137 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines

138 // with result in pass1(pass1_output) to calculate final result in stage 7	138 // with result in pass1(pass1_output) to calculate final result in stage 7

139 // which will be saved into row_idct_output.	139 // which will be saved into row_idct_output.

140 vp9_idct16x16_10_add_neon_pass2(input+1,	140 vpx_idct16x16_10_add_neon_pass2(input+1,

141 row_idct_output,	141 row_idct_output,

142 pass1_output,	142 pass1_output,

143 0,	143 0,

144 dest,	144 dest,

145 dest_stride);	145 dest_stride);

146	146

147 /* Skip Parallel idct on the lower 8 rows as they are all 0s */	147 /* Skip Parallel idct on the lower 8 rows as they are all 0s */

148	148

149 /* Parallel idct on the left 8 columns */	149 /* Parallel idct on the left 8 columns */

150 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the	150 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the

151 // stage 6 result in pass1_output.	151 // stage 6 result in pass1_output.

152 vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);	152 vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);

153	153

154 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines	154 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines

155 // with result in pass1(pass1_output) to calculate final result in stage 7.	155 // with result in pass1(pass1_output) to calculate final result in stage 7.

156 // Then add the result to the destination data.	156 // Then add the result to the destination data.

157 vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,	157 vpx_idct16x16_256_add_neon_pass2(row_idct_output+1,

158 row_idct_output,	158 row_idct_output,

159 pass1_output,	159 pass1_output,

160 1,	160 1,

161 dest,	161 dest,

162 dest_stride);	162 dest_stride);

163	163

164 /* Parallel idct on the right 8 columns */	164 /* Parallel idct on the right 8 columns */

165 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the	165 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the

166 // stage 6 result in pass1_output.	166 // stage 6 result in pass1_output.

167 vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);	167 vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);

168	168

169 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines	169 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines

170 // with result in pass1(pass1_output) to calculate final result in stage 7.	170 // with result in pass1(pass1_output) to calculate final result in stage 7.

171 // Then add the result to the destination data.	171 // Then add the result to the destination data.

172 vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,	172 vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,

173 row_idct_output+8,	173 row_idct_output+8,

174 pass1_output,	174 pass1_output,

175 1,	175 1,

176 dest+8,	176 dest+8,

177 dest_stride);	177 dest_stride);

178	178

179 #if HAVE_NEON_ASM	179 #if HAVE_NEON_ASM

180 // restore d8-d15 register values.	180 // restore d8-d15 register values.

181 vp9_pop_neon(store_reg);	181 vpx_pop_neon(store_reg);

182 #endif	182 #endif

183	183

184 return;	184 return;

185 }	185 }

OLD	NEW

« no previous file with comments | « source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c ('k') | source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm » ('j') | no next file with comments »