Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(151)

Side by Side Diff: source/libvpx/vp9/encoder/vp9_dct.c

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 8 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
Property Changes:
Added: svn:eol-style
+ LF
OLDNEW
(Empty)
1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12 #include <assert.h>
13 #include <math.h>
14 #include "vpx_ports/config.h"
15 #include "vp9/common/vp9_systemdependent.h"
16
17 #include "vp9/common/vp9_blockd.h"
18
// TODO: these transforms can be converted into integer forms to reduce
//       the complexity

/* 4-point orthonormal DCT basis, stored row-major: row k (4 consecutive
 * entries) is the k-th basis vector. */
static const float dct_4[16] = {
  0.500000000000000,  0.500000000000000,  0.500000000000000,  0.500000000000000,
  0.653281482438188,  0.270598050073099, -0.270598050073099, -0.653281482438188,
  0.500000000000000, -0.500000000000000, -0.500000000000000,  0.500000000000000,
  0.270598050073099, -0.653281482438188,  0.653281482438188, -0.270598050073099
};
27
/* 4-point ADST basis, stored row-major: row k (4 consecutive entries) is
 * the k-th basis vector. The rows are orthonormal. */
static const float adst_4[16] = {
  0.228013428883779,  0.428525073124360,  0.577350269189626,  0.656538502008139,
  0.577350269189626,  0.577350269189626,  0.000000000000000, -0.577350269189626,
  0.656538502008139, -0.228013428883779, -0.577350269189626,  0.428525073124359,
  0.428525073124360, -0.656538502008139,  0.577350269189626, -0.228013428883779
};
34
/* 8-point orthonormal DCT basis, stored row-major: row k (8 consecutive
 * entries) is the k-th basis vector. */
static const float dct_8[64] = {
  0.353553390593274,  0.353553390593274,  0.353553390593274,  0.353553390593274,
  0.353553390593274,  0.353553390593274,  0.353553390593274,  0.353553390593274,
  0.490392640201615,  0.415734806151273,  0.277785116509801,  0.097545161008064,
 -0.097545161008064, -0.277785116509801, -0.415734806151273, -0.490392640201615,
  0.461939766255643,  0.191341716182545, -0.191341716182545, -0.461939766255643,
 -0.461939766255643, -0.191341716182545,  0.191341716182545,  0.461939766255643,
  0.415734806151273, -0.097545161008064, -0.490392640201615, -0.277785116509801,
  0.277785116509801,  0.490392640201615,  0.097545161008064, -0.415734806151273,
  0.353553390593274, -0.353553390593274, -0.353553390593274,  0.353553390593274,
  0.353553390593274, -0.353553390593274, -0.353553390593274,  0.353553390593274,
  0.277785116509801, -0.490392640201615,  0.097545161008064,  0.415734806151273,
 -0.415734806151273, -0.097545161008064,  0.490392640201615, -0.277785116509801,
  0.191341716182545, -0.461939766255643,  0.461939766255643, -0.191341716182545,
 -0.191341716182545,  0.461939766255643, -0.461939766255643,  0.191341716182545,
  0.097545161008064, -0.277785116509801,  0.415734806151273, -0.490392640201615,
  0.490392640201615, -0.415734806151273,  0.277785116509801, -0.097545161008064
};
53
/* 8-point ADST basis, stored row-major: row k (8 consecutive entries) is
 * the k-th basis vector. The rows are orthonormal. */
static const float adst_8[64] = {
  0.089131608307533,  0.175227946595735,  0.255357107325376,  0.326790388032145,
  0.387095214016349,  0.434217976756762,  0.466553967085785,  0.483002021635509,
  0.255357107325376,  0.434217976756762,  0.483002021635509,  0.387095214016349,
  0.175227946595735, -0.089131608307533, -0.326790388032145, -0.466553967085785,
  0.387095214016349,  0.466553967085785,  0.175227946595735, -0.255357107325376,
 -0.483002021635509, -0.326790388032145,  0.089131608307533,  0.434217976756762,
  0.466553967085785,  0.255357107325376, -0.326790388032145, -0.434217976756762,
  0.089131608307533,  0.483002021635509,  0.175227946595735, -0.387095214016348,
  0.483002021635509, -0.089131608307533, -0.466553967085785,  0.175227946595735,
  0.434217976756762, -0.255357107325376, -0.387095214016348,  0.326790388032145,
  0.434217976756762, -0.387095214016348, -0.089131608307533,  0.466553967085786,
 -0.326790388032145, -0.175227946595735,  0.483002021635509, -0.255357107325375,
  0.326790388032145, -0.483002021635509,  0.387095214016349, -0.089131608307534,
 -0.255357107325377,  0.466553967085785, -0.434217976756762,  0.175227946595736,
  0.175227946595735, -0.326790388032145,  0.434217976756762, -0.483002021635509,
  0.466553967085785, -0.387095214016348,  0.255357107325376, -0.089131608307532
};
72
/* Converted the transforms to integers. */

/* Integer form of dct_4: each entry is the float coefficient scaled by
 * 2^15 and rounded (e.g. 0.5 -> 16384). Same row-major layout. */
static const int16_t dct_i4[16] = {
  16384,  16384,  16384,  16384,
  21407,   8867,  -8867, -21407,
  16384, -16384, -16384,  16384,
   8867, -21407,  21407,  -8867
};
80
/* Integer form of adst_4: each entry is the float coefficient scaled by
 * 2^15 and rounded. Same row-major layout. */
static const int16_t adst_i4[16] = {
   7472,  14042,  18919,  21513,
  18919,  18919,      0, -18919,
  21513,  -7472, -18919,  14042,
  14042, -21513,  18919,  -7472
};
87
/* Integer form of dct_8: each entry is the float coefficient scaled by
 * 2^15 and rounded. Row-major, 8 entries per basis vector. */
static const int16_t dct_i8[64] = {
  11585,  11585,  11585,  11585,
  11585,  11585,  11585,  11585,
  16069,  13623,   9102,   3196,
  -3196,  -9102, -13623, -16069,
  15137,   6270,  -6270, -15137,
 -15137,  -6270,   6270,  15137,
  13623,  -3196, -16069,  -9102,
   9102,  16069,   3196, -13623,
  11585, -11585, -11585,  11585,
  11585, -11585, -11585,  11585,
   9102, -16069,   3196,  13623,
 -13623,  -3196,  16069,  -9102,
   6270, -15137,  15137,  -6270,
  -6270,  15137, -15137,   6270,
   3196,  -9102,  13623, -16069,
  16069, -13623,   9102,  -3196
};
106
/* Integer form of adst_8: each entry is the float coefficient scaled by
 * 2^15 and rounded. Row-major, 8 entries per basis vector. */
static const int16_t adst_i8[64] = {
   2921,   5742,   8368,  10708,
  12684,  14228,  15288,  15827,
   8368,  14228,  15827,  12684,
   5742,  -2921, -10708, -15288,
  12684,  15288,   5742,  -8368,
 -15827, -10708,   2921,  14228,
  15288,   8368, -10708, -14228,
   2921,  15827,   5742, -12684,
  15827,  -2921, -15288,   5742,
  14228,  -8368, -12684,  10708,
  14228, -12684,  -2921,  15288,
 -10708,  -5742,  15827,  -8368,
  10708, -15827,  12684,  -2921,
  -8368,  15288, -14228,   5742,
   5742, -10708,  14228, -15827,
  15288, -12684,   8368,  -2921
};
125
/* 16-point orthonormal DCT basis, stored row-major: row k (16 consecutive
 * entries, two source lines) is the k-th basis vector. */
static const float dct_16[256] = {
  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,
  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,
  0.351851,  0.338330,  0.311806,  0.273300,  0.224292,  0.166664,  0.102631,  0.034654,
 -0.034654, -0.102631, -0.166664, -0.224292, -0.273300, -0.311806, -0.338330, -0.351851,
  0.346760,  0.293969,  0.196424,  0.068975, -0.068975, -0.196424, -0.293969, -0.346760,
 -0.346760, -0.293969, -0.196424, -0.068975,  0.068975,  0.196424,  0.293969,  0.346760,
  0.338330,  0.224292,  0.034654, -0.166664, -0.311806, -0.351851, -0.273300, -0.102631,
  0.102631,  0.273300,  0.351851,  0.311806,  0.166664, -0.034654, -0.224292, -0.338330,
  0.326641,  0.135299, -0.135299, -0.326641, -0.326641, -0.135299,  0.135299,  0.326641,
  0.326641,  0.135299, -0.135299, -0.326641, -0.326641, -0.135299,  0.135299,  0.326641,
  0.311806,  0.034654, -0.273300, -0.338330, -0.102631,  0.224292,  0.351851,  0.166664,
 -0.166664, -0.351851, -0.224292,  0.102631,  0.338330,  0.273300, -0.034654, -0.311806,
  0.293969, -0.068975, -0.346760, -0.196424,  0.196424,  0.346760,  0.068975, -0.293969,
 -0.293969,  0.068975,  0.346760,  0.196424, -0.196424, -0.346760, -0.068975,  0.293969,
  0.273300, -0.166664, -0.338330,  0.034654,  0.351851,  0.102631, -0.311806, -0.224292,
  0.224292,  0.311806, -0.102631, -0.351851, -0.034654,  0.338330,  0.166664, -0.273300,
  0.250000, -0.250000, -0.250000,  0.250000,  0.250000, -0.250000, -0.250000,  0.250000,
  0.250000, -0.250000, -0.250000,  0.250000,  0.250000, -0.250000, -0.250000,  0.250000,
  0.224292, -0.311806, -0.102631,  0.351851, -0.034654, -0.338330,  0.166664,  0.273300,
 -0.273300, -0.166664,  0.338330,  0.034654, -0.351851,  0.102631,  0.311806, -0.224292,
  0.196424, -0.346760,  0.068975,  0.293969, -0.293969, -0.068975,  0.346760, -0.196424,
 -0.196424,  0.346760, -0.068975, -0.293969,  0.293969,  0.068975, -0.346760,  0.196424,
  0.166664, -0.351851,  0.224292,  0.102631, -0.338330,  0.273300,  0.034654, -0.311806,
  0.311806, -0.034654, -0.273300,  0.338330, -0.102631, -0.224292,  0.351851, -0.166664,
  0.135299, -0.326641,  0.326641, -0.135299, -0.135299,  0.326641, -0.326641,  0.135299,
  0.135299, -0.326641,  0.326641, -0.135299, -0.135299,  0.326641, -0.326641,  0.135299,
  0.102631, -0.273300,  0.351851, -0.311806,  0.166664,  0.034654, -0.224292,  0.338330,
 -0.338330,  0.224292, -0.034654, -0.166664,  0.311806, -0.351851,  0.273300, -0.102631,
  0.068975, -0.196424,  0.293969, -0.346760,  0.346760, -0.293969,  0.196424, -0.068975,
 -0.068975,  0.196424, -0.293969,  0.346760, -0.346760,  0.293969, -0.196424,  0.068975,
  0.034654, -0.102631,  0.166664, -0.224292,  0.273300, -0.311806,  0.338330, -0.351851,
  0.351851, -0.338330,  0.311806, -0.273300,  0.224292, -0.166664,  0.102631, -0.034654
};
160
/* 16-point ADST basis, stored row-major: row k (16 consecutive entries,
 * two source lines) is the k-th basis vector. The rows are orthonormal. */
static const float adst_16[256] = {
  0.033094,  0.065889,  0.098087,  0.129396,  0.159534,  0.188227,  0.215215,  0.240255,
  0.263118,  0.283599,  0.301511,  0.316693,  0.329007,  0.338341,  0.344612,  0.347761,
  0.098087,  0.188227,  0.263118,  0.316693,  0.344612,  0.344612,  0.316693,  0.263118,
  0.188227,  0.098087,  0.000000, -0.098087, -0.188227, -0.263118, -0.316693, -0.344612,
  0.159534,  0.283599,  0.344612,  0.329007,  0.240255,  0.098087, -0.065889, -0.215215,
 -0.316693, -0.347761, -0.301511, -0.188227, -0.033094,  0.129396,  0.263118,  0.338341,
  0.215215,  0.338341,  0.316693,  0.159534, -0.065889, -0.263118, -0.347761, -0.283599,
 -0.098087,  0.129396,  0.301511,  0.344612,  0.240255,  0.033094, -0.188227, -0.329007,
  0.263118,  0.344612,  0.188227, -0.098087, -0.316693, -0.316693, -0.098087,  0.188227,
  0.344612,  0.263118,  0.000000, -0.263118, -0.344612, -0.188227,  0.098087,  0.316693,
  0.301511,  0.301511,  0.000000, -0.301511, -0.301511, -0.000000,  0.301511,  0.301511,
  0.000000, -0.301511, -0.301511, -0.000000,  0.301511,  0.301511,  0.000000, -0.301511,
  0.329007,  0.215215, -0.188227, -0.338341, -0.033094,  0.316693,  0.240255, -0.159534,
 -0.344612, -0.065889,  0.301511,  0.263118, -0.129396, -0.347761, -0.098087,  0.283599,
  0.344612,  0.098087, -0.316693, -0.188227,  0.263118,  0.263118, -0.188227, -0.316693,
  0.098087,  0.344612,  0.000000, -0.344612, -0.098087,  0.316693,  0.188227, -0.263118,
  0.347761, -0.033094, -0.344612,  0.065889,  0.338341, -0.098087, -0.329007,  0.129396,
  0.316693, -0.159534, -0.301511,  0.188227,  0.283599, -0.215215, -0.263118,  0.240255,
  0.338341, -0.159534, -0.263118,  0.283599,  0.129396, -0.344612,  0.033094,  0.329007,
 -0.188227, -0.240255,  0.301511,  0.098087, -0.347761,  0.065889,  0.316693, -0.215215,
  0.316693, -0.263118, -0.098087,  0.344612, -0.188227, -0.188227,  0.344612, -0.098087,
 -0.263118,  0.316693,  0.000000, -0.316693,  0.263118,  0.098087, -0.344612,  0.188227,
  0.283599, -0.329007,  0.098087,  0.215215, -0.347761,  0.188227,  0.129396, -0.338341,
  0.263118,  0.033094, -0.301511,  0.316693, -0.065889, -0.240255,  0.344612, -0.159534,
  0.240255, -0.347761,  0.263118, -0.033094, -0.215215,  0.344612, -0.283599,  0.065889,
  0.188227, -0.338341,  0.301511, -0.098087, -0.159534,  0.329007, -0.316693,  0.129396,
  0.188227, -0.316693,  0.344612, -0.263118,  0.098087,  0.098087, -0.263118,  0.344612,
 -0.316693,  0.188227,  0.000000, -0.188227,  0.316693, -0.344612,  0.263118, -0.098087,
  0.129396, -0.240255,  0.316693, -0.347761,  0.329007, -0.263118,  0.159534, -0.033094,
 -0.098087,  0.215215, -0.301511,  0.344612, -0.338341,  0.283599, -0.188227,  0.065889,
  0.065889, -0.129396,  0.188227, -0.240255,  0.283599, -0.316693,  0.338341, -0.347761,
  0.344612, -0.329007,  0.301511, -0.263118,  0.215215, -0.159534,  0.098087, -0.033094
};
195
/* Converted the transforms to integers. */

/* Integer form of dct_16: each entry is the float coefficient scaled by
 * 2^15 and rounded (e.g. 0.25 -> 8192). Row-major, 16 entries per basis
 * vector (two source lines). */
static const int16_t dct_i16[256] = {
   8192,   8192,   8192,   8192,   8192,   8192,   8192,   8192,
   8192,   8192,   8192,   8192,   8192,   8192,   8192,   8192,
  11529,  11086,  10217,   8955,   7350,   5461,   3363,   1136,
  -1136,  -3363,  -5461,  -7350,  -8955, -10217, -11086, -11529,
  11363,   9633,   6436,   2260,  -2260,  -6436,  -9633, -11363,
 -11363,  -9633,  -6436,  -2260,   2260,   6436,   9633,  11363,
  11086,   7350,   1136,  -5461, -10217, -11529,  -8955,  -3363,
   3363,   8955,  11529,  10217,   5461,  -1136,  -7350, -11086,
  10703,   4433,  -4433, -10703, -10703,  -4433,   4433,  10703,
  10703,   4433,  -4433, -10703, -10703,  -4433,   4433,  10703,
  10217,   1136,  -8955, -11086,  -3363,   7350,  11529,   5461,
  -5461, -11529,  -7350,   3363,  11086,   8955,  -1136, -10217,
   9633,  -2260, -11363,  -6436,   6436,  11363,   2260,  -9633,
  -9633,   2260,  11363,   6436,  -6436, -11363,  -2260,   9633,
   8955,  -5461, -11086,   1136,  11529,   3363, -10217,  -7350,
   7350,  10217,  -3363, -11529,  -1136,  11086,   5461,  -8955,
   8192,  -8192,  -8192,   8192,   8192,  -8192,  -8192,   8192,
   8192,  -8192,  -8192,   8192,   8192,  -8192,  -8192,   8192,
   7350, -10217,  -3363,  11529,  -1136, -11086,   5461,   8955,
  -8955,  -5461,  11086,   1136, -11529,   3363,  10217,  -7350,
   6436, -11363,   2260,   9633,  -9633,  -2260,  11363,  -6436,
  -6436,  11363,  -2260,  -9633,   9633,   2260, -11363,   6436,
   5461, -11529,   7350,   3363, -11086,   8955,   1136, -10217,
  10217,  -1136,  -8955,  11086,  -3363,  -7350,  11529,  -5461,
   4433, -10703,  10703,  -4433,  -4433,  10703, -10703,   4433,
   4433, -10703,  10703,  -4433,  -4433,  10703, -10703,   4433,
   3363,  -8955,  11529, -10217,   5461,   1136,  -7350,  11086,
 -11086,   7350,  -1136,  -5461,  10217, -11529,   8955,  -3363,
   2260,  -6436,   9633, -11363,  11363,  -9633,   6436,  -2260,
  -2260,   6436,  -9633,  11363, -11363,   9633,  -6436,   2260,
   1136,  -3363,   5461,  -7350,   8955, -10217,  11086, -11529,
  11529, -11086,  10217,  -8955,   7350,  -5461,   3363,  -1136
};
231
/* Integer form of adst_16: each entry is the float coefficient scaled by
 * 2^15 and rounded. Row-major, 16 entries per basis vector (two source
 * lines). */
static const int16_t adst_i16[256] = {
   1084,   2159,   3214,   4240,   5228,   6168,   7052,   7873,
   8622,   9293,   9880,  10377,  10781,  11087,  11292,  11395,
   3214,   6168,   8622,  10377,  11292,  11292,  10377,   8622,
   6168,   3214,      0,  -3214,  -6168,  -8622, -10377, -11292,
   5228,   9293,  11292,  10781,   7873,   3214,  -2159,  -7052,
 -10377, -11395,  -9880,  -6168,  -1084,   4240,   8622,  11087,
   7052,  11087,  10377,   5228,  -2159,  -8622, -11395,  -9293,
  -3214,   4240,   9880,  11292,   7873,   1084,  -6168, -10781,
   8622,  11292,   6168,  -3214, -10377, -10377,  -3214,   6168,
  11292,   8622,      0,  -8622, -11292,  -6168,   3214,  10377,
   9880,   9880,      0,  -9880,  -9880,      0,   9880,   9880,
      0,  -9880,  -9880,      0,   9880,   9880,      0,  -9880,
  10781,   7052,  -6168, -11087,  -1084,  10377,   7873,  -5228,
 -11292,  -2159,   9880,   8622,  -4240, -11395,  -3214,   9293,
  11292,   3214, -10377,  -6168,   8622,   8622,  -6168, -10377,
   3214,  11292,      0, -11292,  -3214,  10377,   6168,  -8622,
  11395,  -1084, -11292,   2159,  11087,  -3214, -10781,   4240,
  10377,  -5228,  -9880,   6168,   9293,  -7052,  -8622,   7873,
  11087,  -5228,  -8622,   9293,   4240, -11292,   1084,  10781,
  -6168,  -7873,   9880,   3214, -11395,   2159,  10377,  -7052,
  10377,  -8622,  -3214,  11292,  -6168,  -6168,  11292,  -3214,
  -8622,  10377,      0, -10377,   8622,   3214, -11292,   6168,
   9293, -10781,   3214,   7052, -11395,   6168,   4240, -11087,
   8622,   1084,  -9880,  10377,  -2159,  -7873,  11292,  -5228,
   7873, -11395,   8622,  -1084,  -7052,  11292,  -9293,   2159,
   6168, -11087,   9880,  -3214,  -5228,  10781, -10377,   4240,
   6168, -10377,  11292,  -8622,   3214,   3214,  -8622,  11292,
 -10377,   6168,      0,  -6168,  10377, -11292,   8622,  -3214,
   4240,  -7873,  10377, -11395,  10781,  -8622,   5228,  -1084,
  -3214,   7052,  -9880,  11292, -11087,   9293,  -6168,   2159,
   2159,  -4240,   6168,  -7873,   9293, -10377,  11087, -11395,
  11292, -10781,   9880,  -8622,   7052,  -5228,   3214,  -1084
};
266
/* Fixed-point cosine constants for the 8-point DCT, scaled by 2^14:
 * xCkS(8-k) == round(cos(k * pi / 16) * 16384). */
static const int xC1S7 = 16069;
static const int xC2S6 = 15137;
static const int xC3S5 = 13623;
static const int xC4S4 = 11585;
static const int xC5S3 = 9102;
static const int xC6S2 = 6270;
static const int xC7S1 = 3196;

/* Number of fractional bits carried by the xC?S? constants. */
#define SHIFT_BITS 14
/* Adds half an LSB before a >> SHIFT_BITS. Note the expansion carries its
 * own trailing semicolon. */
#define DOROUND(X) X += (1<<(SHIFT_BITS-1));

/* The column pass drops FINAL_SHIFT bits (with rounding); the row pass
 * pre-scales its input by IN_SHIFT bits. */
#define FINAL_SHIFT 3
#define FINAL_ROUNDING (1<<(FINAL_SHIFT -1))
#define IN_SHIFT (FINAL_SHIFT+1)

/* Multiplies value by a 2^14-scaled constant and rounds the product back
 * down by SHIFT_BITS. */
static int fdct8x8_round_mul(int coeff, int value) {
  return (coeff * value + (1 << (SHIFT_BITS - 1))) >> SHIFT_BITS;
}

/* One 8-point forward DCT butterfly shared by the row and column passes.
 * Reads in[0..7] and writes the eight coefficients to out[0..7]. */
static void fdct8x8_1d(const int in[8], int out[8]) {
  /* Sums and differences of the mirrored input pairs. */
  const int is07 = in[0] + in[7];
  const int is12 = in[1] + in[2];
  const int is34 = in[3] + in[4];
  const int is56 = in[5] + in[6];
  const int id07 = in[0] - in[7];
  const int id12 = in[1] - in[2];
  const int id34 = in[3] - in[4];
  const int id56 = in[5] - in[6];

  const int is0734 = is07 + is34;
  const int is1256 = is12 + is56;

  /* Products re-used by the odd outputs: c4s4 * (s12 - s56) and
   * c4s4 * (d12 + d56). */
  const int common1 = fdct8x8_round_mul(xC4S4, is12 - is56);
  const int common2 = fdct8x8_round_mul(xC4S4, id12 + id56);

  int rot_x, rot_y;

  out[0] = fdct8x8_round_mul(xC4S4, is0734 + is1256);
  out[4] = fdct8x8_round_mul(xC4S4, is0734 - is1256);

  /* Rotation for outputs 2 and 6. */
  rot_x = id12 - id56;
  rot_y = is07 - is34;
  out[2] = fdct8x8_round_mul(xC6S2, rot_x) + fdct8x8_round_mul(xC2S6, rot_y);
  out[6] = fdct8x8_round_mul(xC6S2, rot_y) - fdct8x8_round_mul(xC2S6, rot_x);

  /* Rotation for outputs 1 and 7. */
  rot_x = common1 + id07;
  rot_y = -(id34 + common2);
  out[1] = fdct8x8_round_mul(xC1S7, rot_x) - fdct8x8_round_mul(xC7S1, rot_y);
  out[7] = fdct8x8_round_mul(xC7S1, rot_x) + fdct8x8_round_mul(xC1S7, rot_y);

  /* Rotation for outputs 3 and 5. */
  rot_x = id07 - common1;
  rot_y = id34 - common2;
  out[3] = fdct8x8_round_mul(xC3S5, rot_x) - fdct8x8_round_mul(xC5S3, rot_y);
  out[5] = fdct8x8_round_mul(xC5S3, rot_x) + fdct8x8_round_mul(xC3S5, rot_y);
}

/* Forward 8x8 DCT.
 *
 * InputData  - top-left sample of the 8x8 block; consecutive rows are
 *              (pitch >> 1) shorts apart.
 * OutputData - receives 64 coefficients as 8 rows of 8 contiguous shorts.
 * pitch      - input row stride; it is halved to index shorts, so it is
 *              presumably expressed in bytes.
 *
 * Rows are transformed first into a 32-bit intermediate buffer, then the
 * columns. The input is pre-scaled by 2^IN_SHIFT and the column results are
 * rounded down by FINAL_SHIFT bits.
 *
 * The pre-scale is written as a multiplication because left-shifting a
 * negative value is undefined behaviour in C.
 */
void vp9_short_fdct8x8_c(short *InputData, short *OutputData, int pitch) {
  const int short_pitch = pitch >> 1;
  int InterData[64];
  int row, col, k;

  /* Row pass: scale up, transform, keep full-width intermediates. */
  for (row = 0; row < 8; row++) {
    int scaled[8];

    for (k = 0; k < 8; k++) {
      scaled[k] = InputData[k] * (1 << IN_SHIFT);
    }
    fdct8x8_1d(scaled, &InterData[row * 8]);

    InputData += short_pitch;
  }

  /* Column pass: transform each column, then round away FINAL_SHIFT bits. */
  for (col = 0; col < 8; col++) {
    int column[8];
    int coeffs[8];

    for (k = 0; k < 8; k++) {
      column[k] = InterData[k * 8 + col];
    }
    fdct8x8_1d(column, coeffs);

    for (k = 0; k < 8; k++) {
      OutputData[k * 8 + col] =
          (short)((coeffs[k] + FINAL_ROUNDING) >> FINAL_SHIFT);
    }
  }
}
504
/* 2x2 Haar-style transform ([1 1; 1 -1] butterfly) over the four values at
 * positions 0, 1, 4 and 8 of input.
 *
 * input  - source coefficients; only indices 0, 1, 4 and 8 are read.
 * output - 16 shorts; all are cleared, then indices 0, 1, 4 and 8 receive
 *          the results.
 * pitch  - unused.
 *
 * The four taps are read before output is cleared, so the call also works
 * when input and output refer to the same buffer.
 */
void vp9_short_fhaar2x2_c(short *input, short *output, int pitch) {
  const int x0 = input[0];
  const int x1 = input[1];
  const int x4 = input[4];
  const int x8 = input[8];
  int i;

  (void)pitch;

  for (i = 0; i < 16; i++) {
    output[i] = 0;
  }

  /* Only the first output carries a +1 rounding term. */
  output[0] = (short)((x0 + x1 + x4 + x8 + 1) >> 1);
  output[1] = (short)((x0 - x1 + x4 - x8) >> 1);
  output[4] = (short)((x0 + x1 - x4 - x8) >> 1);
  output[8] = (short)((x0 - x1 - x4 + x8) >> 1);
}
520
521 /* For test */
522 #define TEST_INT 1
523 #if TEST_INT
524 #define vp9_fht_int_c vp9_fht_c
525 #else
526 #define vp9_fht_float_c vp9_fht_c
527 #endif
528
529 void vp9_fht_float_c(const int16_t *input, int pitch, int16_t *output,
530 TX_TYPE tx_type, int tx_dim) {
531 vp9_clear_system_state(); // Make it simd safe : __asm emms;
532 {
533 int i, j, k;
534 float bufa[256], bufb[256]; // buffers are for floating-point test purpose
535 // the implementation could be simplified in
536 // conjunction with integer transform
537 const int16_t *ip = input;
538 int16_t *op = output;
539
540 float *pfa = &bufa[0];
541 float *pfb = &bufb[0];
542
543 // pointers to vertical and horizontal transforms
544 const float *ptv, *pth;
545
546 assert(tx_type != DCT_DCT);
547 // load and convert residual array into floating-point
548 for (j = 0; j < tx_dim; j++) {
549 for (i = 0; i < tx_dim; i++) {
550 pfa[i] = (float)ip[i];
551 }
552 pfa += tx_dim;
553 ip += pitch / 2;
554 }
555
556 // vertical transformation
557 pfa = &bufa[0];
558 pfb = &bufb[0];
559
560 switch (tx_type) {
561 case ADST_ADST :
562 case ADST_DCT :
563 ptv = (tx_dim == 4) ? &adst_4[0] :
564 ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
565 break;
566
567 default :
568 ptv = (tx_dim == 4) ? &dct_4[0] :
569 ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
570 break;
571 }
572
573 for (j = 0; j < tx_dim; j++) {
574 for (i = 0; i < tx_dim; i++) {
575 pfb[i] = 0;
576 for (k = 0; k < tx_dim; k++) {
577 pfb[i] += ptv[k] * pfa[(k * tx_dim)];
578 }
579 pfa += 1;
580 }
581 pfb += tx_dim;
582 ptv += tx_dim;
583 pfa = &bufa[0];
584 }
585
586 // horizontal transformation
587 pfa = &bufa[0];
588 pfb = &bufb[0];
589
590 switch (tx_type) {
591 case ADST_ADST :
592 case DCT_ADST :
593 pth = (tx_dim == 4) ? &adst_4[0] :
594 ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
595 break;
596
597 default :
598 pth = (tx_dim == 4) ? &dct_4[0] :
599 ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
600 break;
601 }
602
603 for (j = 0; j < tx_dim; j++) {
604 for (i = 0; i < tx_dim; i++) {
605 pfa[i] = 0;
606 for (k = 0; k < tx_dim; k++) {
607 pfa[i] += pfb[k] * pth[k];
608 }
609 pth += tx_dim;
610 }
611
612 pfa += tx_dim;
613 pfb += tx_dim;
614 // pth -= tx_dim * tx_dim;
615
616 switch (tx_type) {
617 case ADST_ADST :
618 case DCT_ADST :
619 pth = (tx_dim == 4) ? &adst_4[0] :
620 ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
621 break;
622
623 default :
624 pth = (tx_dim == 4) ? &dct_4[0] :
625 ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
626 break;
627 }
628 }
629
630 // convert to short integer format and load BLOCKD buffer
631 op = output;
632 pfa = &bufa[0];
633
634 for (j = 0; j < tx_dim; j++) {
635 for (i = 0; i < tx_dim; i++) {
636 op[i] = (pfa[i] > 0 ) ? (int16_t)( 8 * pfa[i] + 0.49) :
637 -(int16_t)(- 8 * pfa[i] + 0.49);
638 }
639 op += tx_dim;
640 pfa += tx_dim;
641 }
642 }
643 vp9_clear_system_state(); // Make it simd safe : __asm emms;
644 }
645
646 /* Converted the transforms to integer form. */
647 #define VERTICAL_SHIFT 11
648 #define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
649 #define HORIZONTAL_SHIFT 16
650 #define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
651 void vp9_fht_int_c(const int16_t *input, int pitch, int16_t *output,
652 TX_TYPE tx_type, int tx_dim) {
653 int i, j, k;
654 int16_t imbuf[256];
655
656 const int16_t *ip = input;
657 int16_t *op = output;
658 int16_t *im = &imbuf[0];
659
660 /* pointers to vertical and horizontal transforms. */
661 const int16_t *ptv = NULL, *pth = NULL;
662
663 switch (tx_type) {
664 case ADST_ADST :
665 ptv = pth = (tx_dim == 4) ? &adst_i4[0]
666 : ((tx_dim == 8) ? &adst_i8[0]
667 : &adst_i16[0]);
668 break;
669 case ADST_DCT :
670 ptv = (tx_dim == 4) ? &adst_i4[0]
671 : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);
672 pth = (tx_dim == 4) ? &dct_i4[0]
673 : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
674 break;
675 case DCT_ADST :
676 ptv = (tx_dim == 4) ? &dct_i4[0]
677 : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
678 pth = (tx_dim == 4) ? &adst_i4[0]
679 : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);
680 break;
681 case DCT_DCT :
682 ptv = pth = (tx_dim == 4) ? &dct_i4[0]
683 : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);
684 break;
685 default:
686 assert(0);
687 break;
688 }
689
690 /* vertical transformation */
691 for (j = 0; j < tx_dim; j++) {
692 for (i = 0; i < tx_dim; i++) {
693 int temp = 0;
694
695 for (k = 0; k < tx_dim; k++) {
696 temp += ptv[k] * ip[(k * (pitch >> 1))];
697 }
698
699 im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
700 ip++;
701 }
702 im += tx_dim; // 16
703 ptv += tx_dim;
704 ip = input;
705 }
706
707 /* horizontal transformation */
708 im = &imbuf[0];
709
710 for (j = 0; j < tx_dim; j++) {
711 const int16_t *pthc = pth;
712
713 for (i = 0; i < tx_dim; i++) {
714 int temp = 0;
715
716 for (k = 0; k < tx_dim; k++) {
717 temp += im[k] * pthc[k];
718 }
719
720 op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
721 pthc += tx_dim;
722 }
723
724 im += tx_dim; // 16
725 op += tx_dim;
726 }
727 }
728
/* Forward 4x4 DCT.
 *
 * input  - top-left sample of the 4x4 block; consecutive rows are
 *          (pitch / 2) shorts apart.
 * output - receives 16 coefficients as 4 rows of 4 contiguous shorts.
 * pitch  - input row stride; halved to index shorts.
 *
 * The first pass transforms each input row into output; the second pass
 * transforms the columns of output in place. The x32 pre-scale is written
 * as a multiplication because left-shifting a negative value is undefined
 * behaviour in C.
 */
void vp9_short_fdct4x4_c(short *input, short *output, int pitch) {
  const int in_stride = pitch / 2;
  const short *src = input;
  short *dst = output;
  int i;

  /* Horizontal pass. */
  for (i = 0; i < 4; i++) {
    const int a1 = (src[0] + src[3]) * 32;
    const int b1 = (src[1] + src[2]) * 32;
    const int c1 = (src[1] - src[2]) * 32;
    const int d1 = (src[0] - src[3]) * 32;

    dst[0] = (short)(a1 + b1);
    dst[2] = (short)(a1 - b1);

    dst[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
    dst[3] = (short)((d1 * 2217 - c1 * 5352 + 7500) >> 12);

    src += in_stride;
    dst += 4;
  }

  /* Vertical pass, in place: column i lives at output[i + 4 * row]. */
  for (i = 0; i < 4; i++) {
    short *col = output + i;
    const int a1 = col[0] + col[12];
    const int b1 = col[4] + col[8];
    const int c1 = col[4] - col[8];
    const int d1 = col[0] - col[12];

    col[0] = (short)((a1 + b1 + 7) >> 4);
    col[8] = (short)((a1 - b1 + 7) >> 4);

    /* The first odd output gets +1 whenever d1 is non-zero. */
    col[4] = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
    col[12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);
  }
}

/* Forward DCT of two horizontally adjacent 4x4 blocks: the left block goes
 * to output[0..15], the block starting 4 samples to the right goes to
 * output[16..31]. */
void vp9_short_fdct8x4_c(short *input, short *output, int pitch)
{
  vp9_short_fdct4x4_c(input, output, pitch);
  vp9_short_fdct4x4_c(input + 4, output + 16, pitch);
}
775
/* Forward 4x4 Walsh-Hadamard style transform.
 *
 * input  - top-left value of the 4x4 block; consecutive rows are
 *          (pitch >> 1) shorts apart.
 * output - receives 16 coefficients as 4 rows of 4 contiguous shorts.
 * pitch  - input row stride; halved to index shorts.
 *
 * The first pass runs down each input column and writes output; the second
 * pass runs along each output row in place. Every result is halved, and
 * only the first output of each butterfly carries a +1 rounding term.
 */
void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
  const int in_stride = pitch >> 1;
  int i;

  /* Vertical pass: column i of input -> column i of output. */
  for (i = 0; i < 4; i++) {
    const short *col = input + i;
    const int a1 = col[0 * in_stride] + col[3 * in_stride];
    const int b1 = col[1 * in_stride] + col[2 * in_stride];
    const int c1 = col[1 * in_stride] - col[2 * in_stride];
    const int d1 = col[0 * in_stride] - col[3 * in_stride];

    output[i + 0] = (short)((a1 + b1 + 1) >> 1);
    output[i + 4] = (short)((c1 + d1) >> 1);
    output[i + 8] = (short)((a1 - b1) >> 1);
    output[i + 12] = (short)((d1 - c1) >> 1);
  }

  /* Horizontal pass, in place on each output row. */
  for (i = 0; i < 4; i++) {
    short *row = output + 4 * i;
    const int a1 = row[0] + row[3];
    const int b1 = row[1] + row[2];
    const int c1 = row[1] - row[2];
    const int d1 = row[0] - row[3];

    row[0] = (short)((a1 + b1 + 1) >> 1);
    row[1] = (short)((c1 + d1) >> 1);
    row[2] = (short)((a1 - b1) >> 1);
    row[3] = (short)((d1 - c1) >> 1);
  }
}
815
816 #if CONFIG_LOSSLESS
817 void vp9_short_walsh4x4_lossless_c(short *input, short *output, int pitch) {
818 int i;
819 int a1, b1, c1, d1;
820 short *ip = input;
821 short *op = output;
822 int pitch_short = pitch >> 1;
823
824 for (i = 0; i < 4; i++) {
825 a1 = (ip[0 * pitch_short] + ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
826 b1 = (ip[1 * pitch_short] + ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
827 c1 = (ip[1 * pitch_short] - ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
828 d1 = (ip[0 * pitch_short] - ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;
829
830 op[0] = (a1 + b1 + 1) >> 1;
831 op[4] = (c1 + d1) >> 1;
832 op[8] = (a1 - b1) >> 1;
833 op[12] = (d1 - c1) >> 1;
834
835 ip++;
836 op++;
837 }
838 ip = output;
839 op = output;
840
841 for (i = 0; i < 4; i++) {
842 a1 = ip[0] + ip[3];
843 b1 = ip[1] + ip[2];
844 c1 = ip[1] - ip[2];
845 d1 = ip[0] - ip[3];
846
847 op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
848 op[1] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
849 op[2] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
850 op[3] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
851
852 ip += 4;
853 op += 4;
854 }
855 }
856
857 void vp9_short_walsh4x4_x8_c(short *input, short *output, int pitch) {
858 int i;
859 int a1, b1, c1, d1;
860 short *ip = input;
861 short *op = output;
862 int pitch_short = pitch >> 1;
863
864 for (i = 0; i < 4; i++) {
865 a1 = ip[0 * pitch_short] + ip[3 * pitch_short];
866 b1 = ip[1 * pitch_short] + ip[2 * pitch_short];
867 c1 = ip[1 * pitch_short] - ip[2 * pitch_short];
868 d1 = ip[0 * pitch_short] - ip[3 * pitch_short];
869
870 op[0] = (a1 + b1 + 1) >> 1;
871 op[4] = (c1 + d1) >> 1;
872 op[8] = (a1 - b1) >> 1;
873 op[12] = (d1 - c1) >> 1;
874
875 ip++;
876 op++;
877 }
878 ip = output;
879 op = output;
880
881 for (i = 0; i < 4; i++) {
882 a1 = ip[0] + ip[3];
883 b1 = ip[1] + ip[2];
884 c1 = ip[1] - ip[2];
885 d1 = ip[0] - ip[3];
886
887 op[0] = ((a1 + b1 + 1) >> 1) << WHT_UPSCALE_FACTOR;
888 op[1] = ((c1 + d1) >> 1) << WHT_UPSCALE_FACTOR;
889 op[2] = ((a1 - b1) >> 1) << WHT_UPSCALE_FACTOR;
890 op[3] = ((d1 - c1) >> 1) << WHT_UPSCALE_FACTOR;
891
892 ip += 4;
893 op += 4;
894 }
895 }
896
// Applies vp9_short_walsh4x4_x8_c to two 4x4 blocks: the second block starts
// 4 shorts further into `input` and its 16 coefficients are written
// immediately after the first block's 16 coefficients in `output`.
void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) {
  int blk;

  for (blk = 0; blk < 2; blk++)
    vp9_short_walsh4x4_x8_c(input + 4 * blk, output + 16 * blk, pitch);
}
901 #endif
902
903 #define TEST_INT_16x16_DCT 1
904 #if !TEST_INT_16x16_DCT
// Cosine table for the floating-point 16-point DCT: Ck == cos(k * pi / 32).
static const double C1 = 0.995184726672197;
static const double C2 = 0.98078528040323;
static const double C3 = 0.956940335732209;
static const double C4 = 0.923879532511287;
static const double C5 = 0.881921264348355;
static const double C6 = 0.831469612302545;
static const double C7 = 0.773010453362737;
static const double C8 = 0.707106781186548;
static const double C9 = 0.634393284163646;
static const double C10 = 0.555570233019602;
static const double C11 = 0.471396736825998;
static const double C12 = 0.38268343236509;
static const double C13 = 0.290284677254462;
static const double C14 = 0.195090322016128;
static const double C15 = 0.098017140329561;

// One-dimensional 16-point forward DCT in double precision.
//
// input  : 16 samples; all of them are consumed in stage 1, before any
//          element of `output` is written.
// output : 16 results; also used as scratch storage between stages.
//
// The transform is a four-stage butterfly network. Products are kept in
// separate statements so every floating-point operation is evaluated with
// the same grouping regardless of how the stages are laid out.
static void dct16x16_1d(double input[16], double output[16]) {
  vp9_clear_system_state();  // Make it simd safe : __asm emms;
  {
    double stage[16];
    double sum8, sum9, dif11, sum12, dif13, dif14, dif15;
    double p, q;
    int k;

    // Stage 1: sums and differences of mirrored input pairs.
    for (k = 0; k < 8; k++) {
      stage[k] = input[k] + input[15 - k];
      stage[15 - k] = input[k] - input[15 - k];
    }

    // Stage 2: mirrored butterflies on the lower half...
    for (k = 0; k < 4; k++) {
      output[k] = stage[k] + stage[7 - k];
      output[7 - k] = stage[k] - stage[7 - k];
    }

    // ...and plane rotations on the upper half.
    p = stage[8] * C7;
    q = stage[15] * C9;
    output[8] = p + q;

    p = stage[9] * C11;
    q = stage[14] * C5;
    output[9] = p - q;

    p = stage[10] * C3;
    q = stage[13] * C13;
    output[10] = p + q;

    p = stage[11] * C15;
    q = stage[12] * C1;
    output[11] = p - q;

    p = stage[11] * C1;
    q = stage[12] * C15;
    output[12] = q + p;

    p = stage[10] * C13;
    q = stage[13] * C3;
    output[13] = q - p;

    p = stage[9] * C5;
    q = stage[14] * C11;
    output[14] = q + p;

    p = stage[8] * C9;
    q = stage[15] * C7;
    output[15] = q - p;

    // Stage 3: butterflies on elements 0..3, rotations on 4..7.
    stage[0] = output[0] + output[3];
    stage[1] = output[1] + output[2];
    stage[2] = output[1] - output[2];
    stage[3] = output[0] - output[3];

    p = output[4] * C14;
    q = output[7] * C2;
    stage[4] = p + q;

    p = output[5] * C10;
    q = output[6] * C6;
    stage[5] = p + q;

    p = output[5] * C6;
    q = output[6] * C10;
    stage[6] = q - p;

    p = output[4] * C2;
    q = output[7] * C14;
    stage[7] = q - p;

    // The groups 8..11 and 12..15 use the same four-point butterfly shape.
    for (k = 8; k < 16; k += 4) {
      stage[k] = output[k] + output[k + 3];
      stage[k + 1] = output[k + 1] + output[k + 2];
      stage[k + 2] = output[k + 1] - output[k + 2];
      stage[k + 3] = output[k] - output[k + 3];
    }

    // Stage 4: final combinations, written straight to their output slots.
    output[0] = (stage[0] + stage[1]);
    output[8] = (stage[0] - stage[1]);

    p = stage[2] * C12;
    q = stage[3] * C4;
    p = p + q;
    output[4] = 2 * (p * C8);

    p = stage[2] * C4;
    q = stage[3] * C12;
    p = q - p;
    output[12] = 2 * (p * C8);

    output[2] = 2 * ((stage[4] + stage[5]) * C8);
    output[14] = 2 * ((stage[7] - stage[6]) * C8);

    p = stage[4] - stage[5];
    q = stage[6] + stage[7];
    output[6] = (p + q);
    output[10] = (p - q);

    sum8 = stage[8] + stage[14];
    sum9 = stage[9] + stage[15];

    p = sum8 * C12;
    q = sum9 * C4;
    p = p - q;
    output[3] = 2 * (p * C8);

    p = sum8 * C4;
    q = sum9 * C12;
    p = q + p;
    output[13] = 2 * (p * C8);

    output[9] = 2 * ((stage[10] + stage[11]) * C8);

    dif11 = stage[10] - stage[11];
    sum12 = stage[12] + stage[13];
    dif13 = stage[12] - stage[13];
    dif14 = stage[8] - stage[14];
    dif15 = stage[9] - stage[15];

    output[15] = (dif11 + sum12);
    output[1] = -(dif11 - sum12);

    output[7] = 2 * (dif13 * C8);

    p = dif14 * C12;
    q = dif15 * C4;
    p = p - q;
    output[11] = -2 * (p * C8);

    p = dif14 * C4;
    q = dif15 * C12;
    p = q + p;
    output[5] = 2 * (p * C8);
  }
  vp9_clear_system_state();  // Make it simd safe : __asm emms;
}
1080
// 16x16 forward DCT, floating-point implementation.
//
// input : source samples; rows are `pitch >> 1` shorts apart.
// out   : 256 coefficients, written as a dense 16x16 row-major block.
// pitch : stride of `input` (halved before indexing shorts).
//
// Runs the 1-D transform over every column, then over every row of the
// intermediate result, and finally halves and rounds each value.
void vp9_short_fdct16x16_c(short *input, short *out, int pitch) {
  vp9_clear_system_state();  // Make it simd safe : __asm emms;
  {
    const int stride = pitch >> 1;
    double coeffs[256];
    double line_in[16], line_out[16];
    int r, c, k;

    // Column pass: gather a column, transform it, scatter it back.
    for (c = 0; c < 16; c++) {
      for (r = 0; r < 16; r++)
        line_in[r] = input[r * stride + c];
      dct16x16_1d(line_in, line_out);
      for (r = 0; r < 16; r++)
        coeffs[r * 16 + c] = line_out[r];
    }

    // Row pass over the column-transformed block.
    for (r = 0; r < 16; ++r) {
      double *row = coeffs + r * 16;
      for (c = 0; c < 16; ++c)
        line_in[c] = row[c];
      dct16x16_1d(line_in, line_out);
      for (c = 0; c < 16; ++c)
        row[c] = line_out[c];
    }

    // Final scaling: halve, round to nearest, narrow to short.
    for (k = 0; k < 256; k++)
      out[k] = (short)round(coeffs[k] / 2);
  }
  vp9_clear_system_state();  // Make it simd safe : __asm emms;
}
1111
1112 #else
// Fixed-point cosine table for the 16-point DCT:
// Ck == round(cos(k * pi / 32) * 2^14), i.e. Q14 to match RIGHT_SHIFT.
static const int16_t C1 = 16305;
static const int16_t C2 = 16069;
static const int16_t C3 = 15679;
static const int16_t C4 = 15137;
static const int16_t C5 = 14449;
static const int16_t C6 = 13623;
static const int16_t C7 = 12665;
static const int16_t C8 = 11585;
static const int16_t C9 = 10394;
static const int16_t C10 = 9102;
static const int16_t C11 = 7723;
static const int16_t C12 = 6270;
static const int16_t C13 = 4756;
static const int16_t C14 = 3196;
static const int16_t C15 = 1606;

// Number of fractional bits in the table above and the matching
// round-to-nearest bias used when shifting a product back down.
#define RIGHT_SHIFT 14
#define ROUNDING (1 << (RIGHT_SHIFT - 1))

// One-dimensional 16-point forward DCT in fixed-point arithmetic.
//
// input           : 16 samples; all are consumed in step 1, before any
//                   element of `output` is written.
// output          : 16 results; also used as scratch between steps.
// last_shift_bits : extra right shift (with rounding) folded into the
//                   final step's outputs. Outputs produced by a cosine
//                   multiply are shifted by RIGHT_SHIFT + last_shift_bits,
//                   pure add/sub outputs by last_shift_bits.
static void dct16x16_1d(int16_t input[16], int16_t output[16],
                        int last_shift_bits) {
  int16_t step[16];
  int intermediate[16];
  int temp1, temp2;
  int final_shift = RIGHT_SHIFT;
  int final_rounding = ROUNDING;
  int output_shift = 0;
  int output_rounding = 0;

  final_shift += last_shift_bits;
  if (final_shift > 0)
    final_rounding = 1 << (final_shift - 1);

  output_shift += last_shift_bits;
  if (output_shift > 0)
    output_rounding = 1 << (output_shift - 1);

  // step 1: sums and differences of mirrored input pairs
  step[ 0] = input[0] + input[15];
  step[ 1] = input[1] + input[14];
  step[ 2] = input[2] + input[13];
  step[ 3] = input[3] + input[12];
  step[ 4] = input[4] + input[11];
  step[ 5] = input[5] + input[10];
  step[ 6] = input[6] + input[ 9];
  step[ 7] = input[7] + input[ 8];
  step[ 8] = input[7] - input[ 8];
  step[ 9] = input[6] - input[ 9];
  step[10] = input[5] - input[10];
  step[11] = input[4] - input[11];
  step[12] = input[3] - input[12];
  step[13] = input[2] - input[13];
  step[14] = input[1] - input[14];
  step[15] = input[0] - input[15];

  // step 2: butterflies on the lower half, Q14 rotations on the upper half
  output[0] = step[0] + step[7];
  output[1] = step[1] + step[6];
  output[2] = step[2] + step[5];
  output[3] = step[3] + step[4];
  output[4] = step[3] - step[4];
  output[5] = step[2] - step[5];
  output[6] = step[1] - step[6];
  output[7] = step[0] - step[7];

  temp1 = step[ 8] * C7;
  temp2 = step[15] * C9;
  output[ 8] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT;

  temp1 = step[ 9] * C11;
  temp2 = step[14] * C5;
  output[ 9] = (temp1 - temp2 + ROUNDING) >> RIGHT_SHIFT;

  temp1 = step[10] * C3;
  temp2 = step[13] * C13;
  output[10] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT;

  temp1 = step[11] * C15;
  temp2 = step[12] * C1;
  output[11] = (temp1 - temp2 + ROUNDING) >> RIGHT_SHIFT;

  temp1 = step[11] * C1;
  temp2 = step[12] * C15;
  output[12] = (temp2 + temp1 + ROUNDING) >> RIGHT_SHIFT;

  temp1 = step[10] * C13;
  temp2 = step[13] * C3;
  output[13] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT;

  temp1 = step[ 9] * C5;
  temp2 = step[14] * C11;
  output[14] = (temp2 + temp1 + ROUNDING) >> RIGHT_SHIFT;

  temp1 = step[ 8] * C9;
  temp2 = step[15] * C7;
  output[15] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT;

  // step 3: butterflies on 0..3, 8..11 and 12..15, rotations on 4..7
  step[ 0] = output[0] + output[3];
  step[ 1] = output[1] + output[2];
  step[ 2] = output[1] - output[2];
  step[ 3] = output[0] - output[3];

  temp1 = output[4] * C14;
  temp2 = output[7] * C2;
  step[ 4] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT;

  temp1 = output[5] * C10;
  temp2 = output[6] * C6;
  step[ 5] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT;

  temp1 = output[5] * C6;
  temp2 = output[6] * C10;
  step[ 6] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT;

  temp1 = output[4] * C2;
  temp2 = output[7] * C14;
  step[ 7] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT;

  step[ 8] = output[ 8] + output[11];
  step[ 9] = output[ 9] + output[10];
  step[10] = output[ 9] - output[10];
  step[11] = output[ 8] - output[11];

  step[12] = output[12] + output[15];
  step[13] = output[13] + output[14];
  step[14] = output[13] - output[14];
  step[15] = output[12] - output[15];

  // step 4: final combinations, written straight to their output slots
  output[ 0] = (step[ 0] + step[ 1] + output_rounding) >> output_shift;
  output[ 8] = (step[ 0] - step[ 1] + output_rounding) >> output_shift;

  temp1 = step[2] * C12;
  temp2 = step[3] * C4;
  temp1 = (temp1 + temp2 + final_rounding) >> final_shift;
  output[ 4] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;

  temp1 = step[2] * C4;
  temp2 = step[3] * C12;
  temp1 = (temp2 - temp1 + final_rounding) >> final_shift;
  output[12] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;

  output[ 2] = (2 * ((step[4] + step[ 5]) * C8) + final_rounding)
      >> final_shift;
  output[14] = (2 * ((step[7] - step[ 6]) * C8) + final_rounding)
      >> final_shift;

  temp1 = step[4] - step[5];
  temp2 = step[6] + step[7];
  output[ 6] = (temp1 + temp2 + output_rounding) >> output_shift;
  output[10] = (temp1 - temp2 + output_rounding) >> output_shift;

  intermediate[8] = step[8] + step[14];
  intermediate[9] = step[9] + step[15];

  temp1 = intermediate[8] * C12;
  temp2 = intermediate[9] * C4;
  temp1 = (temp1 - temp2 + final_rounding) >> final_shift;
  output[3] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;

  temp1 = intermediate[8] * C4;
  temp2 = intermediate[9] * C12;
  temp1 = (temp2 + temp1 + final_rounding) >> final_shift;
  output[13] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;

  output[ 9] = (2 * ((step[10] + step[11]) * C8) + final_rounding)
      >> final_shift;

  intermediate[11] = step[10] - step[11];
  intermediate[12] = step[12] + step[13];
  intermediate[13] = step[12] - step[13];
  intermediate[14] = step[ 8] - step[14];
  intermediate[15] = step[ 9] - step[15];

  output[15] = (intermediate[11] + intermediate[12] + output_rounding)
      >> output_shift;
  // Negate the difference first and only then add the rounding bias.
  // Unary minus binds tighter than >>, so negating the whole parenthesised
  // sum would flip the sign of the bias as well and turn an all-zero input
  // into -1 whenever output_shift is non-zero.
  output[ 1] = (intermediate[12] - intermediate[11] + output_rounding)
      >> output_shift;

  output[ 7] = (2 * (intermediate[13] * C8) + final_rounding) >> final_shift;

  temp1 = intermediate[14] * C12;
  temp2 = intermediate[15] * C4;
  temp1 = (temp1 - temp2 + final_rounding) >> final_shift;
  output[11] = (-2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;

  temp1 = intermediate[14] * C4;
  temp2 = intermediate[15] * C12;
  temp1 = (temp2 + temp1 + final_rounding) >> final_shift;
  output[ 5] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;
}
1305
// 16x16 forward DCT, fixed-point implementation.
//
// input : source samples; rows are `pitch >> 1` shorts apart.
// out   : 256 coefficients, written as a dense 16x16 row-major block.
// pitch : stride of `input` (halved before indexing int16_t elements).
//
// The column pass runs the 1-D transform with no extra shift; the row pass
// runs it with one extra bit of right shift and writes directly into `out`.
void vp9_short_fdct16x16_c(int16_t *input, int16_t *out, int pitch) {
  const int stride = pitch >> 1;
  int16_t pass1[256];
  int row, col;

  // Column pass: gather a column, transform it, scatter it into pass1.
  for (col = 0; col < 16; col++) {
    int16_t column_in[16];
    int16_t column_out[16];

    for (row = 0; row < 16; row++)
      column_in[row] = input[row * stride + col];
    dct16x16_1d(column_in, column_out, 0);
    for (row = 0; row < 16; row++)
      pass1[row * 16 + col] = column_out[row];
  }

  // Row pass: each 16-element row of pass1 maps to the same row of out.
  for (row = 0; row < 16; ++row)
    dct16x16_1d(pass1 + row * 16, out + row * 16, 1);
}
1330 #undef RIGHT_SHIFT
1331 #undef ROUNDING
1332 #endif
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698