OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include "vp9/common/vp9_common.h" | 11 #include "vpx_dsp/vpx_dsp_common.h" |
12 | 12 |
13 void vp9_idct16x16_256_add_neon_pass1(const int16_t *input, | 13 void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, |
14 int16_t *output, | 14 int16_t *output, |
15 int output_stride); | 15 int output_stride); |
16 void vp9_idct16x16_256_add_neon_pass2(const int16_t *src, | 16 void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, |
17 int16_t *output, | 17 int16_t *output, |
18 int16_t *pass1Output, | 18 int16_t *pass1Output, |
19 int16_t skip_adding, | 19 int16_t skip_adding, |
20 uint8_t *dest, | 20 uint8_t *dest, |
21 int dest_stride); | 21 int dest_stride); |
22 void vp9_idct16x16_10_add_neon_pass1(const int16_t *input, | 22 void vpx_idct16x16_10_add_neon_pass1(const int16_t *input, |
23 int16_t *output, | 23 int16_t *output, |
24 int output_stride); | 24 int output_stride); |
25 void vp9_idct16x16_10_add_neon_pass2(const int16_t *src, | 25 void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, |
26 int16_t *output, | 26 int16_t *output, |
27 int16_t *pass1Output, | 27 int16_t *pass1Output, |
28 int16_t skip_adding, | 28 int16_t skip_adding, |
29 uint8_t *dest, | 29 uint8_t *dest, |
30 int dest_stride); | 30 int dest_stride); |
31 | 31 |
32 #if HAVE_NEON_ASM | 32 #if HAVE_NEON_ASM |
33 /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ | 33 /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ |
34 extern void vp9_push_neon(int64_t *store); | 34 extern void vpx_push_neon(int64_t *store); |
35 extern void vp9_pop_neon(int64_t *store); | 35 extern void vpx_pop_neon(int64_t *store); |
36 #endif // HAVE_NEON_ASM | 36 #endif // HAVE_NEON_ASM |
37 | 37 |
38 void vp9_idct16x16_256_add_neon(const int16_t *input, | 38 void vpx_idct16x16_256_add_neon(const int16_t *input, |
39 uint8_t *dest, int dest_stride) { | 39 uint8_t *dest, int dest_stride) { |
40 #if HAVE_NEON_ASM | 40 #if HAVE_NEON_ASM |
41 int64_t store_reg[8]; | 41 int64_t store_reg[8]; |
42 #endif | 42 #endif |
43 int16_t pass1_output[16*16] = {0}; | 43 int16_t pass1_output[16*16] = {0}; |
44 int16_t row_idct_output[16*16] = {0}; | 44 int16_t row_idct_output[16*16] = {0}; |
45 | 45 |
46 #if HAVE_NEON_ASM | 46 #if HAVE_NEON_ASM |
47 // save d8-d15 register values. | 47 // save d8-d15 register values. |
48 vp9_push_neon(store_reg); | 48 vpx_push_neon(store_reg); |
49 #endif | 49 #endif |
50 | 50 |
51 /* Parallel idct on the upper 8 rows */ | 51 /* Parallel idct on the upper 8 rows */ |
52 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the | 52 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the |
53 // stage 6 result in pass1_output. | 53 // stage 6 result in pass1_output. |
54 vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8); | 54 vpx_idct16x16_256_add_neon_pass1(input, pass1_output, 8); |
55 | 55 |
56 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines | 56 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines |
57 // with result in pass1(pass1_output) to calculate final result in stage 7 | 57 // with result in pass1(pass1_output) to calculate final result in stage 7 |
58 // which will be saved into row_idct_output. | 58 // which will be saved into row_idct_output. |
59 vp9_idct16x16_256_add_neon_pass2(input+1, | 59 vpx_idct16x16_256_add_neon_pass2(input+1, |
60 row_idct_output, | 60 row_idct_output, |
61 pass1_output, | 61 pass1_output, |
62 0, | 62 0, |
63 dest, | 63 dest, |
64 dest_stride); | 64 dest_stride); |
65 | 65 |
66 /* Parallel idct on the lower 8 rows */ | 66 /* Parallel idct on the lower 8 rows */ |
67 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the | 67 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the |
68 // stage 6 result in pass1_output. | 68 // stage 6 result in pass1_output. |
69 vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8); | 69 vpx_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8); |
70 | 70 |
71 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines | 71 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines |
72 // with result in pass1(pass1_output) to calculate final result in stage 7 | 72 // with result in pass1(pass1_output) to calculate final result in stage 7 |
73 // which will be saved into row_idct_output. | 73 // which will be saved into row_idct_output. |
74 vp9_idct16x16_256_add_neon_pass2(input+8*16+1, | 74 vpx_idct16x16_256_add_neon_pass2(input+8*16+1, |
75 row_idct_output+8, | 75 row_idct_output+8, |
76 pass1_output, | 76 pass1_output, |
77 0, | 77 0, |
78 dest, | 78 dest, |
79 dest_stride); | 79 dest_stride); |
80 | 80 |
81 /* Parallel idct on the left 8 columns */ | 81 /* Parallel idct on the left 8 columns */ |
82 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the | 82 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the |
83 // stage 6 result in pass1_output. | 83 // stage 6 result in pass1_output. |
84 vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); | 84 vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); |
85 | 85 |
86 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines | 86 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines |
87 // with result in pass1(pass1_output) to calculate final result in stage 7. | 87 // with result in pass1(pass1_output) to calculate final result in stage 7. |
88 // Then add the result to the destination data. | 88 // Then add the result to the destination data. |
89 vp9_idct16x16_256_add_neon_pass2(row_idct_output+1, | 89 vpx_idct16x16_256_add_neon_pass2(row_idct_output+1, |
90 row_idct_output, | 90 row_idct_output, |
91 pass1_output, | 91 pass1_output, |
92 1, | 92 1, |
93 dest, | 93 dest, |
94 dest_stride); | 94 dest_stride); |
95 | 95 |
96 /* Parallel idct on the right 8 columns */ | 96 /* Parallel idct on the right 8 columns */ |
97 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the | 97 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the |
98 // stage 6 result in pass1_output. | 98 // stage 6 result in pass1_output. |
99 vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); | 99 vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); |
100 | 100 |
101 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines | 101 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines |
102 // with result in pass1(pass1_output) to calculate final result in stage 7. | 102 // with result in pass1(pass1_output) to calculate final result in stage 7. |
103 // Then add the result to the destination data. | 103 // Then add the result to the destination data. |
104 vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, | 104 vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, |
105 row_idct_output+8, | 105 row_idct_output+8, |
106 pass1_output, | 106 pass1_output, |
107 1, | 107 1, |
108 dest+8, | 108 dest+8, |
109 dest_stride); | 109 dest_stride); |
110 | 110 |
111 #if HAVE_NEON_ASM | 111 #if HAVE_NEON_ASM |
112 // restore d8-d15 register values. | 112 // restore d8-d15 register values. |
113 vp9_pop_neon(store_reg); | 113 vpx_pop_neon(store_reg); |
114 #endif | 114 #endif |
115 | 115 |
116 return; | 116 return; |
117 } | 117 } |
118 | 118 |
119 void vp9_idct16x16_10_add_neon(const int16_t *input, | 119 void vpx_idct16x16_10_add_neon(const int16_t *input, |
120 uint8_t *dest, int dest_stride) { | 120 uint8_t *dest, int dest_stride) { |
121 #if HAVE_NEON_ASM | 121 #if HAVE_NEON_ASM |
122 int64_t store_reg[8]; | 122 int64_t store_reg[8]; |
123 #endif | 123 #endif |
124 int16_t pass1_output[16*16] = {0}; | 124 int16_t pass1_output[16*16] = {0}; |
125 int16_t row_idct_output[16*16] = {0}; | 125 int16_t row_idct_output[16*16] = {0}; |
126 | 126 |
127 #if HAVE_NEON_ASM | 127 #if HAVE_NEON_ASM |
128 // save d8-d15 register values. | 128 // save d8-d15 register values. |
129 vp9_push_neon(store_reg); | 129 vpx_push_neon(store_reg); |
130 #endif | 130 #endif |
131 | 131 |
132 /* Parallel idct on the upper 8 rows */ | 132 /* Parallel idct on the upper 8 rows */ |
133 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the | 133 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the |
134 // stage 6 result in pass1_output. | 134 // stage 6 result in pass1_output. |
135 vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8); | 135 vpx_idct16x16_10_add_neon_pass1(input, pass1_output, 8); |
136 | 136 |
137 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines | 137 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines |
138 // with result in pass1(pass1_output) to calculate final result in stage 7 | 138 // with result in pass1(pass1_output) to calculate final result in stage 7 |
139 // which will be saved into row_idct_output. | 139 // which will be saved into row_idct_output. |
140 vp9_idct16x16_10_add_neon_pass2(input+1, | 140 vpx_idct16x16_10_add_neon_pass2(input+1, |
141 row_idct_output, | 141 row_idct_output, |
142 pass1_output, | 142 pass1_output, |
143 0, | 143 0, |
144 dest, | 144 dest, |
145 dest_stride); | 145 dest_stride); |
146 | 146 |
147 /* Skip Parallel idct on the lower 8 rows as they are all 0s */ | 147 /* Skip Parallel idct on the lower 8 rows as they are all 0s */ |
148 | 148 |
149 /* Parallel idct on the left 8 columns */ | 149 /* Parallel idct on the left 8 columns */ |
150 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the | 150 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the |
151 // stage 6 result in pass1_output. | 151 // stage 6 result in pass1_output. |
152 vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); | 152 vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); |
153 | 153 |
154 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines | 154 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines |
155 // with result in pass1(pass1_output) to calculate final result in stage 7. | 155 // with result in pass1(pass1_output) to calculate final result in stage 7. |
156 // Then add the result to the destination data. | 156 // Then add the result to the destination data. |
157 vp9_idct16x16_256_add_neon_pass2(row_idct_output+1, | 157 vpx_idct16x16_256_add_neon_pass2(row_idct_output+1, |
158 row_idct_output, | 158 row_idct_output, |
159 pass1_output, | 159 pass1_output, |
160 1, | 160 1, |
161 dest, | 161 dest, |
162 dest_stride); | 162 dest_stride); |
163 | 163 |
164 /* Parallel idct on the right 8 columns */ | 164 /* Parallel idct on the right 8 columns */ |
165 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the | 165 // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the |
166 // stage 6 result in pass1_output. | 166 // stage 6 result in pass1_output. |
167 vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); | 167 vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); |
168 | 168 |
169 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines | 169 // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines |
170 // with result in pass1(pass1_output) to calculate final result in stage 7. | 170 // with result in pass1(pass1_output) to calculate final result in stage 7. |
171 // Then add the result to the destination data. | 171 // Then add the result to the destination data. |
172 vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, | 172 vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, |
173 row_idct_output+8, | 173 row_idct_output+8, |
174 pass1_output, | 174 pass1_output, |
175 1, | 175 1, |
176 dest+8, | 176 dest+8, |
177 dest_stride); | 177 dest_stride); |
178 | 178 |
179 #if HAVE_NEON_ASM | 179 #if HAVE_NEON_ASM |
180 // restore d8-d15 register values. | 180 // restore d8-d15 register values. |
181 vp9_pop_neon(store_reg); | 181 vpx_pop_neon(store_reg); |
182 #endif | 182 #endif |
183 | 183 |
184 return; | 184 return; |
185 } | 185 } |
OLD | NEW |