Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(99)

Side by Side Diff: source/libvpx/vp9/common/vp9_idctllm.c

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 8 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
Property Changes:
Added: svn:eol-style
+ LF
OLDNEW
(Empty)
1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12 /****************************************************************************
13 * Notes:
14 *
15 * This implementation makes use of 16 bit fixed point version of two multiply
16 * constants:
17 * 1. sqrt(2) * cos (pi/8)
18 * 2. sqrt(2) * sin (pi/8)
19 * Because the first constant is bigger than 1, to maintain the same 16 bit
20 * fixed point precision as the second one, we use a trick of
21 * x * a = x + x*(a-1)
22 * so
23 * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
24 **************************************************************************/
25 #include <assert.h>
26 #include <math.h>
27 #include "vpx_ports/config.h"
28 #include "vp9/common/vp9_systemdependent.h"
29
30 #include "vp9/common/vp9_blockd.h"
31
/* 16-bit fixed-point multipliers used by the 4x4 inverse transform:
 * (sqrt(2) * cos(pi/8) - 1) and sqrt(2) * sin(pi/8), each scaled by 2^16.
 * `rounding` is the bias added before the >> 16 (currently none). */
static const int cospi8sqrt2minus1 = 20091;
static const int sinpi8sqrt2 = 35468;
static const int rounding = 0;
35
// TODO: these transforms can be further converted into integer forms
//       for complexity optimization

/* 4x4 floating-point DCT basis, row-major; consumed by the floating-point
 * inverse hybrid transform. */
static const float idct_4[16] = {
  0.500000000000000,  0.653281482438188,  0.500000000000000,  0.270598050073099,
  0.500000000000000,  0.270598050073099, -0.500000000000000, -0.653281482438188,
  0.500000000000000, -0.270598050073099, -0.500000000000000,  0.653281482438188,
  0.500000000000000, -0.653281482438188,  0.500000000000000, -0.270598050073099
};
44
/* 4x4 floating-point ADST basis, row-major. */
static const float iadst_4[16] = {
  0.228013428883779,  0.577350269189626,  0.656538502008139,  0.428525073124360,
  0.428525073124360,  0.577350269189626, -0.228013428883779, -0.656538502008139,
  0.577350269189626,  0,                 -0.577350269189626,  0.577350269189626,
  0.656538502008139, -0.577350269189626,  0.428525073124359, -0.228013428883779
};
51
/* 8x8 floating-point DCT basis, row-major (8 rows of 8 coefficients, each
 * row written across two source lines). */
static const float idct_8[64] = {
  0.353553390593274,  0.490392640201615,  0.461939766255643,  0.415734806151273,
  0.353553390593274,  0.277785116509801,  0.191341716182545,  0.097545161008064,
  0.353553390593274,  0.415734806151273,  0.191341716182545, -0.097545161008064,
  -0.353553390593274, -0.490392640201615, -0.461939766255643, -0.277785116509801,
  0.353553390593274,  0.277785116509801, -0.191341716182545, -0.490392640201615,
  -0.353553390593274,  0.097545161008064,  0.461939766255643,  0.415734806151273,
  0.353553390593274,  0.097545161008064, -0.461939766255643, -0.277785116509801,
  0.353553390593274,  0.415734806151273, -0.191341716182545, -0.490392640201615,
  0.353553390593274, -0.097545161008064, -0.461939766255643,  0.277785116509801,
  0.353553390593274, -0.415734806151273, -0.191341716182545,  0.490392640201615,
  0.353553390593274, -0.277785116509801, -0.191341716182545,  0.490392640201615,
  -0.353553390593274, -0.097545161008064,  0.461939766255643, -0.415734806151273,
  0.353553390593274, -0.415734806151273,  0.191341716182545,  0.097545161008064,
  -0.353553390593274,  0.490392640201615, -0.461939766255643,  0.277785116509801,
  0.353553390593274, -0.490392640201615,  0.461939766255643, -0.415734806151273,
  0.353553390593274, -0.277785116509801,  0.191341716182545, -0.097545161008064
};
70
/* 8x8 floating-point ADST basis, row-major (8 rows of 8 coefficients, each
 * row written across two source lines). */
static const float iadst_8[64] = {
  0.089131608307533,  0.255357107325376,  0.387095214016349,  0.466553967085785,
  0.483002021635509,  0.434217976756762,  0.326790388032145,  0.175227946595735,
  0.175227946595735,  0.434217976756762,  0.466553967085785,  0.255357107325376,
  -0.089131608307533, -0.387095214016348, -0.483002021635509, -0.326790388032145,
  0.255357107325376,  0.483002021635509,  0.175227946595735, -0.326790388032145,
  -0.466553967085785, -0.089131608307533,  0.387095214016349,  0.434217976756762,
  0.326790388032145,  0.387095214016349, -0.255357107325376, -0.434217976756762,
  0.175227946595735,  0.466553967085786, -0.089131608307534, -0.483002021635509,
  0.387095214016349,  0.175227946595735, -0.483002021635509,  0.089131608307533,
  0.434217976756762, -0.326790388032145, -0.255357107325377,  0.466553967085785,
  0.434217976756762, -0.089131608307533, -0.326790388032145,  0.483002021635509,
  -0.255357107325376, -0.175227946595735,  0.466553967085785, -0.387095214016348,
  0.466553967085785, -0.326790388032145,  0.089131608307533,  0.175227946595735,
  -0.387095214016348,  0.483002021635509, -0.434217976756762,  0.255357107325376,
  0.483002021635509, -0.466553967085785,  0.434217976756762, -0.387095214016348,
  0.326790388032145, -0.255357107325375,  0.175227946595736, -0.089131608307532
};
89
/* 4x4 integer DCT basis, row-major; consumed by the integer inverse hybrid
 * transform. */
static const int16_t idct_i4[16] = {
  8192,  10703,  8192,   4433,
  8192,   4433, -8192, -10703,
  8192,  -4433, -8192,  10703,
  8192, -10703,  8192,  -4433
};
96
/* 4x4 integer ADST basis, row-major. */
static const int16_t iadst_i4[16] = {
   3736,  9459, 10757,   7021,
   7021,  9459, -3736, -10757,
   9459,     0, -9459,   9459,
  10757, -9459,  7021,  -3736
};
103
/* 8x8 integer DCT basis, row-major (each row of 8 coefficients is written
 * across two source lines). */
static const int16_t idct_i8[64] = {
  5793,  8035,  7568,  6811,
  5793,  4551,  3135,  1598,
  5793,  6811,  3135, -1598,
  -5793, -8035, -7568, -4551,
  5793,  4551, -3135, -8035,
  -5793,  1598,  7568,  6811,
  5793,  1598, -7568, -4551,
  5793,  6811, -3135, -8035,
  5793, -1598, -7568,  4551,
  5793, -6811, -3135,  8035,
  5793, -4551, -3135,  8035,
  -5793, -1598,  7568, -6811,
  5793, -6811,  3135,  1598,
  -5793,  8035, -7568,  4551,
  5793, -8035,  7568, -6811,
  5793, -4551,  3135, -1598
};
122
/* 8x8 integer ADST basis, row-major (each row of 8 coefficients is written
 * across two source lines). */
static const int16_t iadst_i8[64] = {
  1460,  4184,  6342,  7644,
  7914,  7114,  5354,  2871,
  2871,  7114,  7644,  4184,
  -1460, -6342, -7914, -5354,
  4184,  7914,  2871, -5354,
  -7644, -1460,  6342,  7114,
  5354,  6342, -4184, -7114,
  2871,  7644, -1460, -7914,
  6342,  2871, -7914,  1460,
  7114, -5354, -4184,  7644,
  7114, -1460, -5354,  7914,
  -4184, -2871,  7644, -6342,
  7644, -5354,  1460,  2871,
  -6342,  7914, -7114,  4184,
  7914, -7644,  7114, -6342,
  5354, -4184,  2871, -1460
};
141
/* 16x16 floating-point DCT basis, row-major (each row of 16 coefficients is
 * written across two source lines).  Read-only, so declared const like the
 * smaller tables. */
static const float idct_16[256] = {
  0.250000,  0.351851,  0.346760,  0.338330,  0.326641,  0.311806,  0.293969,  0.273300,
  0.250000,  0.224292,  0.196424,  0.166664,  0.135299,  0.102631,  0.068975,  0.034654,
  0.250000,  0.338330,  0.293969,  0.224292,  0.135299,  0.034654, -0.068975, -0.166664,
  -0.250000, -0.311806, -0.346760, -0.351851, -0.326641, -0.273300, -0.196424, -0.102631,
  0.250000,  0.311806,  0.196424,  0.034654, -0.135299, -0.273300, -0.346760, -0.338330,
  -0.250000, -0.102631,  0.068975,  0.224292,  0.326641,  0.351851,  0.293969,  0.166664,
  0.250000,  0.273300,  0.068975, -0.166664, -0.326641, -0.338330, -0.196424,  0.034654,
  0.250000,  0.351851,  0.293969,  0.102631, -0.135299, -0.311806, -0.346760, -0.224292,
  0.250000,  0.224292, -0.068975, -0.311806, -0.326641, -0.102631,  0.196424,  0.351851,
  0.250000, -0.034654, -0.293969, -0.338330, -0.135299,  0.166664,  0.346760,  0.273300,
  0.250000,  0.166664, -0.196424, -0.351851, -0.135299,  0.224292,  0.346760,  0.102631,
  -0.250000, -0.338330, -0.068975,  0.273300,  0.326641,  0.034654, -0.293969, -0.311806,
  0.250000,  0.102631, -0.293969, -0.273300,  0.135299,  0.351851,  0.068975, -0.311806,
  -0.250000,  0.166664,  0.346760,  0.034654, -0.326641, -0.224292,  0.196424,  0.338330,
  0.250000,  0.034654, -0.346760, -0.102631,  0.326641,  0.166664, -0.293969, -0.224292,
  0.250000,  0.273300, -0.196424, -0.311806,  0.135299,  0.338330, -0.068975, -0.351851,
  0.250000, -0.034654, -0.346760,  0.102631,  0.326641, -0.166664, -0.293969,  0.224292,
  0.250000, -0.273300, -0.196424,  0.311806,  0.135299, -0.338330, -0.068975,  0.351851,
  0.250000, -0.102631, -0.293969,  0.273300,  0.135299, -0.351851,  0.068975,  0.311806,
  -0.250000, -0.166664,  0.346760, -0.034654, -0.326641,  0.224292,  0.196424, -0.338330,
  0.250000, -0.166664, -0.196424,  0.351851, -0.135299, -0.224292,  0.346760, -0.102631,
  -0.250000,  0.338330, -0.068975, -0.273300,  0.326641, -0.034654, -0.293969,  0.311806,
  0.250000, -0.224292, -0.068975,  0.311806, -0.326641,  0.102631,  0.196424, -0.351851,
  0.250000,  0.034654, -0.293969,  0.338330, -0.135299, -0.166664,  0.346760, -0.273300,
  0.250000, -0.273300,  0.068975,  0.166664, -0.326641,  0.338330, -0.196424, -0.034654,
  0.250000, -0.351851,  0.293969, -0.102631, -0.135299,  0.311806, -0.346760,  0.224292,
  0.250000, -0.311806,  0.196424, -0.034654, -0.135299,  0.273300, -0.346760,  0.338330,
  -0.250000,  0.102631,  0.068975, -0.224292,  0.326641, -0.351851,  0.293969, -0.166664,
  0.250000, -0.338330,  0.293969, -0.224292,  0.135299, -0.034654, -0.068975,  0.166664,
  -0.250000,  0.311806, -0.346760,  0.351851, -0.326641,  0.273300, -0.196424,  0.102631,
  0.250000, -0.351851,  0.346760, -0.338330,  0.326641, -0.311806,  0.293969, -0.273300,
  0.250000, -0.224292,  0.196424, -0.166664,  0.135299, -0.102631,  0.068975, -0.034654
};
176
/* 16x16 floating-point ADST basis, row-major (each row of 16 coefficients is
 * written across two source lines).  Read-only, so declared const like the
 * smaller tables. */
static const float iadst_16[256] = {
  0.033094,  0.098087,  0.159534,  0.215215,  0.263118,  0.301511,  0.329007,  0.344612,
  0.347761,  0.338341,  0.316693,  0.283599,  0.240255,  0.188227,  0.129396,  0.065889,
  0.065889,  0.188227,  0.283599,  0.338341,  0.344612,  0.301511,  0.215215,  0.098087,
  -0.033094, -0.159534, -0.263118, -0.329007, -0.347761, -0.316693, -0.240255, -0.129396,
  0.098087,  0.263118,  0.344612,  0.316693,  0.188227,  0.000000, -0.188227, -0.316693,
  -0.344612, -0.263118, -0.098087,  0.098087,  0.263118,  0.344612,  0.316693,  0.188227,
  0.129396,  0.316693,  0.329007,  0.159534, -0.098087, -0.301511, -0.338341, -0.188227,
  0.065889,  0.283599,  0.344612,  0.215215, -0.033094, -0.263118, -0.347761, -0.240255,
  0.159534,  0.344612,  0.240255, -0.065889, -0.316693, -0.301511, -0.033094,  0.263118,
  0.338341,  0.129396, -0.188227, -0.347761, -0.215215,  0.098087,  0.329007,  0.283599,
  0.188227,  0.344612,  0.098087, -0.263118, -0.316693, -0.000000,  0.316693,  0.263118,
  -0.098087, -0.344612, -0.188227,  0.188227,  0.344612,  0.098087, -0.263118, -0.316693,
  0.215215,  0.316693, -0.065889, -0.347761, -0.098087,  0.301511,  0.240255, -0.188227,
  -0.329007,  0.033094,  0.344612,  0.129396, -0.283599, -0.263118,  0.159534,  0.338341,
  0.240255,  0.263118, -0.215215, -0.283599,  0.188227,  0.301511, -0.159534, -0.316693,
  0.129396,  0.329007, -0.098087, -0.338341,  0.065889,  0.344612, -0.033094, -0.347761,
  0.263118,  0.188227, -0.316693, -0.098087,  0.344612,  0.000000, -0.344612,  0.098087,
  0.316693, -0.188227, -0.263118,  0.263118,  0.188227, -0.316693, -0.098087,  0.344612,
  0.283599,  0.098087, -0.347761,  0.129396,  0.263118, -0.301511, -0.065889,  0.344612,
  -0.159534, -0.240255,  0.316693,  0.033094, -0.338341,  0.188227,  0.215215, -0.329007,
  0.301511,  0.000000, -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,  0.000000,
  -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,
  0.316693, -0.098087, -0.188227,  0.344612, -0.263118, -0.000000,  0.263118, -0.344612,
  0.188227,  0.098087, -0.316693,  0.316693, -0.098087, -0.188227,  0.344612, -0.263118,
  0.329007, -0.188227, -0.033094,  0.240255, -0.344612,  0.301511, -0.129396, -0.098087,
  0.283599, -0.347761,  0.263118, -0.065889, -0.159534,  0.316693, -0.338341,  0.215215,
  0.338341, -0.263118,  0.129396,  0.033094, -0.188227,  0.301511, -0.347761,  0.316693,
  -0.215215,  0.065889,  0.098087, -0.240255,  0.329007, -0.344612,  0.283599, -0.159534,
  0.344612, -0.316693,  0.263118, -0.188227,  0.098087,  0.000000, -0.098087,  0.188227,
  -0.263118,  0.316693, -0.344612,  0.344612, -0.316693,  0.263118, -0.188227,  0.098087,
  0.347761, -0.344612,  0.338341, -0.329007,  0.316693, -0.301511,  0.283599, -0.263118,
  0.240255, -0.215215,  0.188227, -0.159534,  0.129396, -0.098087,  0.065889, -0.033094
};
211
/* 16x16 integer DCT basis, row-major (each row of 16 coefficients is written
 * across two source lines). */
static const int16_t idct_i16[256] = {
  4096,  5765,  5681,  5543,  5352,  5109,  4816,  4478,
  4096,  3675,  3218,  2731,  2217,  1682,  1130,   568,
  4096,  5543,  4816,  3675,  2217,   568, -1130, -2731,
  -4096, -5109, -5681, -5765, -5352, -4478, -3218, -1682,
  4096,  5109,  3218,   568, -2217, -4478, -5681, -5543,
  -4096, -1682,  1130,  3675,  5352,  5765,  4816,  2731,
  4096,  4478,  1130, -2731, -5352, -5543, -3218,   568,
  4096,  5765,  4816,  1682, -2217, -5109, -5681, -3675,
  4096,  3675, -1130, -5109, -5352, -1682,  3218,  5765,
  4096,  -568, -4816, -5543, -2217,  2731,  5681,  4478,
  4096,  2731, -3218, -5765, -2217,  3675,  5681,  1682,
  -4096, -5543, -1130,  4478,  5352,   568, -4816, -5109,
  4096,  1682, -4816, -4478,  2217,  5765,  1130, -5109,
  -4096,  2731,  5681,   568, -5352, -3675,  3218,  5543,
  4096,   568, -5681, -1682,  5352,  2731, -4816, -3675,
  4096,  4478, -3218, -5109,  2217,  5543, -1130, -5765,
  4096,  -568, -5681,  1682,  5352, -2731, -4816,  3675,
  4096, -4478, -3218,  5109,  2217, -5543, -1130,  5765,
  4096, -1682, -4816,  4478,  2217, -5765,  1130,  5109,
  -4096, -2731,  5681,  -568, -5352,  3675,  3218, -5543,
  4096, -2731, -3218,  5765, -2217, -3675,  5681, -1682,
  -4096,  5543, -1130, -4478,  5352,  -568, -4816,  5109,
  4096, -3675, -1130,  5109, -5352,  1682,  3218, -5765,
  4096,   568, -4816,  5543, -2217, -2731,  5681, -4478,
  4096, -4478,  1130,  2731, -5352,  5543, -3218,  -568,
  4096, -5765,  4816, -1682, -2217,  5109, -5681,  3675,
  4096, -5109,  3218,  -568, -2217,  4478, -5681,  5543,
  -4096,  1682,  1130, -3675,  5352, -5765,  4816, -2731,
  4096, -5543,  4816, -3675,  2217,  -568, -1130,  2731,
  -4096,  5109, -5681,  5765, -5352,  4478, -3218,  1682,
  4096, -5765,  5681, -5543,  5352, -5109,  4816, -4478,
  4096, -3675,  3218, -2731,  2217, -1682,  1130,  -568
};
246
/* 16x16 integer ADST basis, row-major (each row of 16 coefficients is written
 * across two source lines). */
static const int16_t iadst_i16[256] = {
  542,  1607,  2614,  3526,  4311,  4940,  5390,  5646,
  5698,  5543,  5189,  4646,  3936,  3084,  2120,  1080,
  1080,  3084,  4646,  5543,  5646,  4940,  3526,  1607,
  -542, -2614, -4311, -5390, -5698, -5189, -3936, -2120,
  1607,  4311,  5646,  5189,  3084,     0, -3084, -5189,
  -5646, -4311, -1607,  1607,  4311,  5646,  5189,  3084,
  2120,  5189,  5390,  2614, -1607, -4940, -5543, -3084,
  1080,  4646,  5646,  3526,  -542, -4311, -5698, -3936,
  2614,  5646,  3936, -1080, -5189, -4940,  -542,  4311,
  5543,  2120, -3084, -5698, -3526,  1607,  5390,  4646,
  3084,  5646,  1607, -4311, -5189,     0,  5189,  4311,
  -1607, -5646, -3084,  3084,  5646,  1607, -4311, -5189,
  3526,  5189, -1080, -5698, -1607,  4940,  3936, -3084,
  -5390,   542,  5646,  2120, -4646, -4311,  2614,  5543,
  3936,  4311, -3526, -4646,  3084,  4940, -2614, -5189,
  2120,  5390, -1607, -5543,  1080,  5646,  -542, -5698,
  4311,  3084, -5189, -1607,  5646,     0, -5646,  1607,
  5189, -3084, -4311,  4311,  3084, -5189, -1607,  5646,
  4646,  1607, -5698,  2120,  4311, -4940, -1080,  5646,
  -2614, -3936,  5189,   542, -5543,  3084,  3526, -5390,
  4940,     0, -4940,  4940,     0, -4940,  4940,     0,
  -4940,  4940,     0, -4940,  4940,     0, -4940,  4940,
  5189, -1607, -3084,  5646, -4311,     0,  4311, -5646,
  3084,  1607, -5189,  5189, -1607, -3084,  5646, -4311,
  5390, -3084,  -542,  3936, -5646,  4940, -2120, -1607,
  4646, -5698,  4311, -1080, -2614,  5189, -5543,  3526,
  5543, -4311,  2120,   542, -3084,  4940, -5698,  5189,
  -3526,  1080,  1607, -3936,  5390, -5646,  4646, -2614,
  5646, -5189,  4311, -3084,  1607,     0, -1607,  3084,
  -4311,  5189, -5646,  5646, -5189,  4311, -3084,  1607,
  5698, -5646,  5543, -5390,  5189, -4940,  4646, -4311,
  3936, -3526,  3084, -2614,  2120, -1607,  1080,  -542
};
281
282 void vp9_ihtllm_float_c(const int16_t *input, int16_t *output, int pitch,
283 TX_TYPE tx_type, int tx_dim) {
284 vp9_clear_system_state(); // Make it simd safe : __asm emms;
285 {
286 int i, j, k;
287 float bufa[256], bufb[256]; // buffers are for floating-point test purpose
288 // the implementation could be simplified in
289 // conjunction with integer transform
290 const int16_t *ip = input;
291 int16_t *op = output;
292 int shortpitch = pitch >> 1;
293
294 float *pfa = &bufa[0];
295 float *pfb = &bufb[0];
296
297 // pointers to vertical and horizontal transforms
298 const float *ptv, *pth;
299
300 assert(tx_type != DCT_DCT);
301 // load and convert residual array into floating-point
302 for(j = 0; j < tx_dim; j++) {
303 for(i = 0; i < tx_dim; i++) {
304 pfa[i] = (float)ip[i];
305 }
306 pfa += tx_dim;
307 ip += tx_dim;
308 }
309
310 // vertical transformation
311 pfa = &bufa[0];
312 pfb = &bufb[0];
313
314 switch(tx_type) {
315 case ADST_ADST :
316 case ADST_DCT :
317 ptv = (tx_dim == 4) ? &iadst_4[0] :
318 ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
319 break;
320
321 default :
322 ptv = (tx_dim == 4) ? &idct_4[0] :
323 ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
324 break;
325 }
326
327 for(j = 0; j < tx_dim; j++) {
328 for(i = 0; i < tx_dim; i++) {
329 pfb[i] = 0 ;
330 for(k = 0; k < tx_dim; k++) {
331 pfb[i] += ptv[k] * pfa[(k * tx_dim)];
332 }
333 pfa += 1;
334 }
335
336 pfb += tx_dim;
337 ptv += tx_dim;
338 pfa = &bufa[0];
339 }
340
341 // horizontal transformation
342 pfa = &bufa[0];
343 pfb = &bufb[0];
344
345 switch(tx_type) {
346 case ADST_ADST :
347 case DCT_ADST :
348 pth = (tx_dim == 4) ? &iadst_4[0] :
349 ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
350 break;
351
352 default :
353 pth = (tx_dim == 4) ? &idct_4[0] :
354 ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
355 break;
356 }
357
358 for(j = 0; j < tx_dim; j++) {
359 for(i = 0; i < tx_dim; i++) {
360 pfa[i] = 0;
361 for(k = 0; k < tx_dim; k++) {
362 pfa[i] += pfb[k] * pth[k];
363 }
364 pth += tx_dim;
365 }
366
367 pfa += tx_dim;
368 pfb += tx_dim;
369
370 switch(tx_type) {
371 case ADST_ADST :
372 case DCT_ADST :
373 pth = (tx_dim == 4) ? &iadst_4[0] :
374 ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
375 break;
376
377 default :
378 pth = (tx_dim == 4) ? &idct_4[0] :
379 ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
380 break;
381 }
382 }
383
384 // convert to short integer format and load BLOCKD buffer
385 op = output;
386 pfa = &bufa[0];
387
388 for(j = 0; j < tx_dim; j++) {
389 for(i = 0; i < tx_dim; i++) {
390 op[i] = (pfa[i] > 0 ) ? (int16_t)( pfa[i] / 8 + 0.49) :
391 -(int16_t)( - pfa[i] / 8 + 0.49);
392 }
393
394 op += shortpitch;
395 pfa += tx_dim;
396 }
397 }
398 vp9_clear_system_state(); // Make it simd safe : __asm emms;
399 }
400
401 /* Converted the transforms to integer form. */
402 #define VERTICAL_SHIFT 14 // 16
403 #define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
404 #define HORIZONTAL_SHIFT 17 // 15
405 #define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
406 void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
407 TX_TYPE tx_type, int tx_dim) {
408 int i, j, k;
409 int16_t imbuf[256];
410
411 const int16_t *ip = input;
412 int16_t *op = output;
413 int16_t *im = &imbuf[0];
414
415 /* pointers to vertical and horizontal transforms. */
416 const int16_t *ptv = NULL, *pth = NULL;
417 int shortpitch = pitch >> 1;
418
419 switch (tx_type) {
420 case ADST_ADST :
421 ptv = pth = (tx_dim == 4) ? &iadst_i4[0]
422 : ((tx_dim == 8) ? &iadst_i8[0]
423 : &iadst_i16[0]);
424 break;
425 case ADST_DCT :
426 ptv = (tx_dim == 4) ? &iadst_i4[0]
427 : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);
428 pth = (tx_dim == 4) ? &idct_i4[0]
429 : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);
430 break;
431 case DCT_ADST :
432 ptv = (tx_dim == 4) ? &idct_i4[0]
433 : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);
434 pth = (tx_dim == 4) ? &iadst_i4[0]
435 : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);
436 break;
437 case DCT_DCT :
438 ptv = pth = (tx_dim == 4) ? &idct_i4[0]
439 : ((tx_dim == 8) ? &idct_i8[0]
440 : &idct_i16[0]);
441 break;
442 default:
443 assert(0);
444 break;
445 }
446
447 /* vertical transformation */
448 for (j = 0; j < tx_dim; j++) {
449 for (i = 0; i < tx_dim; i++) {
450 int temp = 0;
451
452 for (k = 0; k < tx_dim; k++) {
453 temp += ptv[k] * ip[(k * tx_dim)];
454 }
455
456 im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
457 ip++;
458 }
459 im += tx_dim; // 16
460 ptv += tx_dim;
461 ip = input;
462 }
463
464 /* horizontal transformation */
465 im = &imbuf[0];
466
467 for (j = 0; j < tx_dim; j++) {
468 const int16_t *pthc = pth;
469
470 for (i = 0; i < tx_dim; i++) {
471 int temp = 0;
472
473 for (k = 0; k < tx_dim; k++) {
474 temp += im[k] * pthc[k];
475 }
476
477 op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
478 pthc += tx_dim;
479 }
480
481 im += tx_dim; // 16
482 op += shortpitch;
483 }
484 }
485
486 void vp9_short_idct4x4llm_c(short *input, short *output, int pitch) {
487 int i;
488 int a1, b1, c1, d1;
489
490 short *ip = input;
491 short *op = output;
492 int temp1, temp2;
493 int shortpitch = pitch >> 1;
494
495 for (i = 0; i < 4; i++) {
496 a1 = ip[0] + ip[8];
497 b1 = ip[0] - ip[8];
498
499 temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
500 temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
501 c1 = temp1 - temp2;
502
503 temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
504 temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
505 d1 = temp1 + temp2;
506
507 op[shortpitch * 0] = a1 + d1;
508 op[shortpitch * 3] = a1 - d1;
509
510 op[shortpitch * 1] = b1 + c1;
511 op[shortpitch * 2] = b1 - c1;
512
513 ip++;
514 op++;
515 }
516
517 ip = output;
518 op = output;
519
520 for (i = 0; i < 4; i++) {
521 a1 = ip[0] + ip[2];
522 b1 = ip[0] - ip[2];
523
524 temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;
525 temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);
526 c1 = temp1 - temp2;
527
528 temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);
529 temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;
530 d1 = temp1 + temp2;
531
532 op[0] = (a1 + d1 + 16) >> 5;
533 op[3] = (a1 - d1 + 16) >> 5;
534
535 op[1] = (b1 + c1 + 16) >> 5;
536 op[2] = (b1 - c1 + 16) >> 5;
537
538 ip += shortpitch;
539 op += shortpitch;
540 }
541 }
542
/* DC-only shortcut of the 4x4 inverse transform: every one of the 16 outputs
 * is (input[0] + 16) >> 5.
 *
 * input  : only input[0] is read.
 * output : 4x4 result; consecutive rows are (pitch >> 1) shorts apart.
 */
void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch) {
  int r, c;
  short *op = output;
  const int shortpitch = pitch >> 1;
  const int a1 = (input[0] + 16) >> 5;

  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++) {
      op[c] = a1;
    }
    op += shortpitch;
  }
}
557
/* Adds the DC-only 4x4 inverse-transform value, (input_dc + 16) >> 5, to a
 * 4x4 block of prediction pixels and stores the result clamped to [0, 255].
 *
 * pred_ptr / pitch  : prediction block and its row stride in bytes.
 * dst_ptr  / stride : destination block and its row stride in bytes.
 */
void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
                            unsigned char *dst_ptr, int pitch, int stride) {
  const int a1 = (input_dc + 16) >> 5;
  int r, c;

  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++) {
      int a = a1 + pred_ptr[c];

      if (a < 0)
        a = 0;
      else if (a > 255)
        a = 255;

      dst_ptr[c] = (unsigned char)a;
    }

    dst_ptr += stride;
    pred_ptr += pitch;
  }
}
580
/* 4x4 inverse Walsh-Hadamard transform.
 *
 * input / output : 16 contiguous shorts each (4 rows of 4).
 *
 * The first pass transforms each input row into output; the second pass
 * transforms each output column in place.  Every butterfly result is halved,
 * and only the first output of each butterfly gets a +1 rounding term.
 */
void vp9_short_inv_walsh4x4_c(short *input, short *output) {
  int i;
  int a1, b1, c1, d1;
  short *ip = input;
  short *op = output;

  /* first pass: rows */
  for (i = 0; i < 4; i++) {
    a1 = ip[0] + ip[3];
    b1 = ip[1] + ip[2];
    c1 = ip[1] - ip[2];
    d1 = ip[0] - ip[3];

    op[0] = (a1 + b1 + 1) >> 1;
    op[1] = (c1 + d1) >> 1;
    op[2] = (a1 - b1) >> 1;
    op[3] = (d1 - c1) >> 1;

    ip += 4;
    op += 4;
  }

  /* second pass: columns, in place (inputs are read into a1..d1 first) */
  ip = output;
  op = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[0] + ip[12];
    b1 = ip[4] + ip[8];
    c1 = ip[4] - ip[8];
    d1 = ip[0] - ip[12];

    op[0] = (a1 + b1 + 1) >> 1;
    op[4] = (c1 + d1) >> 1;
    op[8] = (a1 - b1) >> 1;
    op[12] = (d1 - c1) >> 1;

    ip++;
    op++;
  }
}
617
/* DC-only shortcut of the 4x4 inverse Walsh-Hadamard transform.
 *
 * in  : only in[0] is read.
 * out : 16 contiguous shorts (4 rows of 4).
 *
 * A 4-entry row is derived from in[0] (first entry rounded up, the rest
 * truncated), then each entry is expanded down its column the same way.
 */
void vp9_short_inv_walsh4x4_1_c(short *in, short *out) {
  int i;
  short tmp[4];
  const short *ip;
  short *op;

  tmp[0] = (in[0] + 1) >> 1;
  tmp[1] = tmp[2] = tmp[3] = (in[0] >> 1);

  ip = tmp;
  op = out;
  for (i = 0; i < 4; i++) {
    op[0] = (ip[0] + 1) >> 1;
    op[4] = op[8] = op[12] = (ip[0] >> 1);
    ip++;
    op++;
  }
}
636
637 #if CONFIG_LOSSLESS
638 void vp9_short_inv_walsh4x4_lossless_c(short *input, short *output) {
639 int i;
640 int a1, b1, c1, d1;
641 short *ip = input;
642 short *op = output;
643
644 for (i = 0; i < 4; i++) {
645 a1 = ((ip[0] + ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
646 b1 = ((ip[1] + ip[2])) >> Y2_WHT_UPSCALE_FACTOR;
647 c1 = ((ip[1] - ip[2])) >> Y2_WHT_UPSCALE_FACTOR;
648 d1 = ((ip[0] - ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
649
650 op[0] = (a1 + b1 + 1) >> 1;
651 op[1] = (c1 + d1) >> 1;
652 op[2] = (a1 - b1) >> 1;
653 op[3] = (d1 - c1) >> 1;
654
655 ip += 4;
656 op += 4;
657 }
658
659 ip = output;
660 op = output;
661 for (i = 0; i < 4; i++) {
662 a1 = ip[0] + ip[12];
663 b1 = ip[4] + ip[8];
664 c1 = ip[4] - ip[8];
665 d1 = ip[0] - ip[12];
666
667
668 op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
669 op[4] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
670 op[8] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
671 op[12] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
672
673 ip++;
674 op++;
675 }
676 }
677
678 void vp9_short_inv_walsh4x4_1_lossless_c(short *in, short *out) {
679 int i;
680 short tmp[4];
681 short *ip = in;
682 short *op = tmp;
683
684 op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1;
685 op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1);
686
687 ip = tmp;
688 op = out;
689 for (i = 0; i < 4; i++) {
690 op[0] = ((ip[0] + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
691 op[4] = op[8] = op[12] = ((ip[0] >> 1)) << Y2_WHT_UPSCALE_FACTOR;
692 ip++;
693 op++;
694 }
695 }
696
697 void vp9_short_inv_walsh4x4_x8_c(short *input, short *output, int pitch) {
698 int i;
699 int a1, b1, c1, d1;
700 short *ip = input;
701 short *op = output;
702 int shortpitch = pitch >> 1;
703
704 for (i = 0; i < 4; i++) {
705 a1 = ((ip[0] + ip[3])) >> WHT_UPSCALE_FACTOR;
706 b1 = ((ip[1] + ip[2])) >> WHT_UPSCALE_FACTOR;
707 c1 = ((ip[1] - ip[2])) >> WHT_UPSCALE_FACTOR;
708 d1 = ((ip[0] - ip[3])) >> WHT_UPSCALE_FACTOR;
709
710 op[0] = (a1 + b1 + 1) >> 1;
711 op[1] = (c1 + d1) >> 1;
712 op[2] = (a1 - b1) >> 1;
713 op[3] = (d1 - c1) >> 1;
714
715 ip += 4;
716 op += shortpitch;
717 }
718
719 ip = output;
720 op = output;
721 for (i = 0; i < 4; i++) {
722 a1 = ip[shortpitch * 0] + ip[shortpitch * 3];
723 b1 = ip[shortpitch * 1] + ip[shortpitch * 2];
724 c1 = ip[shortpitch * 1] - ip[shortpitch * 2];
725 d1 = ip[shortpitch * 0] - ip[shortpitch * 3];
726
727
728 op[shortpitch * 0] = (a1 + b1 + 1) >> 1;
729 op[shortpitch * 1] = (c1 + d1) >> 1;
730 op[shortpitch * 2] = (a1 - b1) >> 1;
731 op[shortpitch * 3] = (d1 - c1) >> 1;
732
733 ip++;
734 op++;
735 }
736 }
737
738 void vp9_short_inv_walsh4x4_1_x8_c(short *in, short *out, int pitch) {
739 int i;
740 short tmp[4];
741 short *ip = in;
742 short *op = tmp;
743 int shortpitch = pitch >> 1;
744
745 op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
746 op[1] = op[2] = op[3] = ((ip[0] >> WHT_UPSCALE_FACTOR) >> 1);
747
748
749 ip = tmp;
750 op = out;
751 for (i = 0; i < 4; i++) {
752 op[shortpitch * 0] = (ip[0] + 1) >> 1;
753 op[shortpitch * 1] = op[shortpitch * 2] = op[shortpitch * 3] = ip[0] >> 1;
754 ip++;
755 op++;
756 }
757 }
758
/* Runs the DC-only strided inverse Walsh-Hadamard transform on input_dc into
 * a local 4x4 buffer, adds the result to a 4x4 block of prediction pixels and
 * stores it clamped to [0, 255].
 *
 * pred_ptr / pitch  : prediction block and its row stride in bytes.
 * dst_ptr  / stride : destination block and its row stride in bytes.
 */
void vp9_dc_only_inv_walsh_add_c(short input_dc, unsigned char *pred_ptr,
                                 unsigned char *dst_ptr,
                                 int pitch, int stride) {
  int r, c;
  short tmp[16];

  /* pitch argument of 8 gives a row stride of 4 shorts, i.e. a packed 4x4 */
  vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1);

  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++) {
      int a = tmp[r * 4 + c] + pred_ptr[c];

      if (a < 0)
        a = 0;
      else if (a > 255)
        a = 255;

      dst_ptr[c] = (unsigned char)a;
    }

    dst_ptr += stride;
    pred_ptr += pitch;
  }
}
782 #endif
783
/* Adds the DC-only inverse-transform value, (input_dc + 16) >> 5, to an 8x8
 * block of prediction pixels and stores the result clamped to [0, 255].
 *
 * The 8x8 area is processed as four 4x4 sub-blocks in raster order
 * (top-left, top-right, bottom-left, bottom-right).
 *
 * pred_ptr / pitch  : prediction block and its row stride in bytes.
 * dst_ptr  / stride : destination block and its row stride in bytes.
 */
void vp9_dc_only_idct_add_8x8_c(short input_dc,
                                unsigned char *pred_ptr,
                                unsigned char *dst_ptr,
                                int pitch, int stride) {
  const int a1 = (input_dc + 16) >> 5;
  int r, c, b;

  for (b = 0; b < 4; b++) {
    /* Sub-block origin is computed up front, so no pointer is ever formed
     * for a non-existent fifth sub-block after the last iteration. */
    const int col_off = (b % 2) * 4;
    const int row_off = (b / 2) * 4;
    const unsigned char *pred = pred_ptr + col_off + row_off * pitch;
    unsigned char *dst = dst_ptr + col_off + row_off * stride;

    for (r = 0; r < 4; r++) {
      for (c = 0; c < 4; c++) {
        int a = a1 + pred[c];

        if (a < 0)
          a = 0;
        else if (a > 255)
          a = 255;

        dst[c] = (unsigned char)a;
      }

      dst += stride;
      pred += pitch;
    }
  }
}
813
#define W1 2841 /* 2048*sqrt(2)*cos(1*pi/16) */
#define W2 2676 /* 2048*sqrt(2)*cos(2*pi/16) */
#define W3 2408 /* 2048*sqrt(2)*cos(3*pi/16) */
#define W5 1609 /* 2048*sqrt(2)*cos(5*pi/16) */
#define W6 1108 /* 2048*sqrt(2)*cos(6*pi/16) */
#define W7 565  /* 2048*sqrt(2)*cos(7*pi/16) */

/* row (horizontal) IDCT
 *
 *           7                       pi         1
 * dst[k] = sum c[l] * src[l] * cos( -- * ( k + - ) * l )
 *          l=0                      8          2
 *
 * where: c[0]    = 128
 *        c[1..7] = 128*sqrt(2)
 *
 * blk points at 8 consecutive ints that are transformed in place.
 *
 * Scaling by powers of two is written as a multiplication rather than a left
 * shift: coefficients may be negative, and left-shifting a negative int is
 * undefined behaviour in C.  For in-range values the result is identical.
 */
static void idctrow(int *blk) {
  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

  x1 = blk[4] * 2048;
  x2 = blk[6];
  x3 = blk[2];
  x4 = blk[1];
  x5 = blk[7];
  x6 = blk[5];
  x7 = blk[3];

  /* shortcut: only the DC term is non-zero */
  if (!(x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    const int dc = blk[0] * 8;

    blk[0] = blk[1] = blk[2] = blk[3] = blk[4]
           = blk[5] = blk[6] = blk[7] = dc;
    return;
  }

  x0 = blk[0] * 2048 + 128; /* for proper rounding in the fourth stage */

  /* first stage */
  x8 = W7 * (x4 + x5);
  x4 = x8 + (W1 - W7) * x4;
  x5 = x8 - (W1 + W7) * x5;
  x8 = W3 * (x6 + x7);
  x6 = x8 - (W3 - W5) * x6;
  x7 = x8 - (W3 + W5) * x7;

  /* second stage */
  x8 = x0 + x1;
  x0 -= x1;
  x1 = W6 * (x3 + x2);
  x2 = x1 - (W2 + W6) * x2;
  x3 = x1 + (W2 - W6) * x3;
  x1 = x4 + x6;
  x4 -= x6;
  x6 = x5 + x7;
  x5 -= x7;

  /* third stage */
  x7 = x8 + x3;
  x8 -= x3;
  x3 = x0 + x2;
  x0 -= x2;
  x2 = (181 * (x4 + x5) + 128) >> 8;
  x4 = (181 * (x4 - x5) + 128) >> 8;

  /* fourth stage */
  blk[0] = (x7 + x1) >> 8;
  blk[1] = (x3 + x2) >> 8;
  blk[2] = (x0 + x4) >> 8;
  blk[3] = (x8 + x6) >> 8;
  blk[4] = (x8 - x6) >> 8;
  blk[5] = (x0 - x4) >> 8;
  blk[6] = (x3 - x2) >> 8;
  blk[7] = (x7 - x1) >> 8;
}
876
877 /* column (vertical) IDCT
878 *
879 * 7 pi 1 dst[8*k] = sum c[l] * src[8*l] *
880 * cos( -- * ( k + - ) * l ) l=0 8 2
881 *
882 * where: c[0] = 1/1024 c[1..7] = (1/1024)*sqrt(2) */
883 static void idctcol(int *blk) {
884 int x0, x1, x2, x3, x4, x5, x6, x7, x8;
885
886 /* shortcut */
887 if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
888 (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |
889 (x7 = blk[8 * 3]))) {
890 blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]
891 = blk[8 * 4] = blk[8 * 5] = blk[8 * 6 ]
892 = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);
893 return;
894 }
895
896 x0 = (blk[8 * 0] << 8) + 16384;
897
898 /* first stage */
899 x8 = W7 * (x4 + x5) + 4;
900 x4 = (x8 + (W1 - W7) * x4) >> 3;
901 x5 = (x8 - (W1 + W7) * x5) >> 3;
902 x8 = W3 * (x6 + x7) + 4;
903 x6 = (x8 - (W3 - W5) * x6) >> 3;
904 x7 = (x8 - (W3 + W5) * x7) >> 3;
905
906 /* second stage */
907 x8 = x0 + x1;
908 x0 -= x1;
909 x1 = W6 * (x3 + x2) + 4;
910 x2 = (x1 - (W2 + W6) * x2) >> 3;
911 x3 = (x1 + (W2 - W6) * x3) >> 3;
912 x1 = x4 + x6;
913 x4 -= x6;
914 x6 = x5 + x7;
915 x5 -= x7;
916
917 /* third stage */
918 x7 = x8 + x3;
919 x8 -= x3;
920 x3 = x0 + x2;
921 x0 -= x2;
922 x2 = (181 * (x4 + x5) + 128) >> 8;
923 x4 = (181 * (x4 - x5) + 128) >> 8;
924
925 /* fourth stage */
926 blk[8 * 0] = (x7 + x1) >> 14;
927 blk[8 * 1] = (x3 + x2) >> 14;
928 blk[8 * 2] = (x0 + x4) >> 14;
929 blk[8 * 3] = (x8 + x6) >> 14;
930 blk[8 * 4] = (x8 - x6) >> 14;
931 blk[8 * 5] = (x0 - x4) >> 14;
932 blk[8 * 6] = (x3 - x2) >> 14;
933 blk[8 * 7] = (x7 - x1) >> 14;
934 }
935
#define TX_DIM 8
/* 8x8 inverse DCT.
 *
 * coefs: TX_DIM * TX_DIM input coefficients, read with a row stride of
 *        TX_DIM.
 * block: destination; row i is written starting at block[i * (pitch >> 1)].
 * pitch: halved to obtain the destination row stride in shorts.
 *
 * The coefficients are pre-scaled into an int work buffer, transformed
 * row by row and then column by column, and stored with a final >> 1.
 */
void vp9_short_idct8x8_c(short *coefs, short *block, int pitch) {
  int work[TX_DIM * TX_DIM];
  const int out_stride = pitch >> 1;
  int r, c, k;

  /* pre-scale: (v + 1 + (v < 0)) >> 2 */
  for (k = 0; k < TX_DIM * TX_DIM; k++) {
    const int v = coefs[k];
    work[k] = (v + 1 + (v < 0)) >> 2;
  }

  for (r = 0; r < TX_DIM; r++)
    idctrow(work + TX_DIM * r);

  for (c = 0; c < TX_DIM; c++)
    idctcol(work + c);

  for (r = 0; r < TX_DIM; r++) {
    const int *src = work + r * TX_DIM;
    short *dst = block + r * out_stride;
    for (c = 0; c < TX_DIM; c++)
      dst[c] = src[c] >> 1;
  }
}
960
961 /* Row IDCT when only first 4 coefficients are non-zero. */
962 static void idctrow10(int *blk) {
963 int x0, x1, x2, x3, x4, x5, x6, x7, x8;
964
965 /* shortcut */
966 if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |
967 (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {
968 blk[0] = blk[1] = blk[2] = blk[3] = blk[4]
969 = blk[5] = blk[6] = blk[7] = blk[0] << 3;
970 return;
971 }
972
973 x0 = (blk[0] << 11) + 128; /* for proper rounding in the fourth stage */
974 /* first stage */
975 x5 = W7 * x4;
976 x4 = W1 * x4;
977 x6 = W3 * x7;
978 x7 = -W5 * x7;
979
980 /* second stage */
981 x2 = W6 * x3;
982 x3 = W2 * x3;
983 x1 = x4 + x6;
984 x4 -= x6;
985 x6 = x5 + x7;
986 x5 -= x7;
987
988 /* third stage */
989 x7 = x0 + x3;
990 x8 = x0 - x3;
991 x3 = x0 + x2;
992 x0 -= x2;
993 x2 = (181 * (x4 + x5) + 128) >> 8;
994 x4 = (181 * (x4 - x5) + 128) >> 8;
995
996 /* fourth stage */
997 blk[0] = (x7 + x1) >> 8;
998 blk[1] = (x3 + x2) >> 8;
999 blk[2] = (x0 + x4) >> 8;
1000 blk[3] = (x8 + x6) >> 8;
1001 blk[4] = (x8 - x6) >> 8;
1002 blk[5] = (x0 - x4) >> 8;
1003 blk[6] = (x3 - x2) >> 8;
1004 blk[7] = (x7 - x1) >> 8;
1005 }
1006
1007 /* Column (vertical) IDCT when only first 4 coefficients are non-zero. */
1008 static void idctcol10(int *blk) {
1009 int x0, x1, x2, x3, x4, x5, x6, x7, x8;
1010
1011 /* shortcut */
1012 if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
1013 (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |
1014 (x7 = blk[8 * 3]))) {
1015 blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]
1016 = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]
1017 = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);
1018 return;
1019 }
1020
1021 x0 = (blk[8 * 0] << 8) + 16384;
1022
1023 /* first stage */
1024 x5 = (W7 * x4 + 4) >> 3;
1025 x4 = (W1 * x4 + 4) >> 3;
1026 x6 = (W3 * x7 + 4) >> 3;
1027 x7 = (-W5 * x7 + 4) >> 3;
1028
1029 /* second stage */
1030 x2 = (W6 * x3 + 4) >> 3;
1031 x3 = (W2 * x3 + 4) >> 3;
1032 x1 = x4 + x6;
1033 x4 -= x6;
1034 x6 = x5 + x7;
1035 x5 -= x7;
1036
1037 /* third stage */
1038 x7 = x0 + x3;
1039 x8 = x0 - x3;
1040 x3 = x0 + x2;
1041 x0 -= x2;
1042 x2 = (181 * (x4 + x5) + 128) >> 8;
1043 x4 = (181 * (x4 - x5) + 128) >> 8;
1044
1045 /* fourth stage */
1046 blk[8 * 0] = (x7 + x1) >> 14;
1047 blk[8 * 1] = (x3 + x2) >> 14;
1048 blk[8 * 2] = (x0 + x4) >> 14;
1049 blk[8 * 3] = (x8 + x6) >> 14;
1050 blk[8 * 4] = (x8 - x6) >> 14;
1051 blk[8 * 5] = (x0 - x4) >> 14;
1052 blk[8 * 6] = (x3 - x2) >> 14;
1053 blk[8 * 7] = (x7 - x1) >> 14;
1054 }
1055
1056 void vp9_short_idct10_8x8_c(short *coefs, short *block, int pitch) {
1057 int X[TX_DIM * TX_DIM];
1058 int i, j;
1059 int shortpitch = pitch >> 1;
1060
1061 for (i = 0; i < TX_DIM; i++) {
1062 for (j = 0; j < TX_DIM; j++) {
1063 X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1
1064 + (coefs[i * TX_DIM + j] < 0)) >> 2;
1065 }
1066 }
1067
1068 /* Do first 4 row idct only since non-zero dct coefficients are all in
1069 * upper-left 4x4 area. */
1070 for (i = 0; i < 4; i++)
1071 idctrow10(X + 8 * i);
1072
1073 for (i = 0; i < 8; i++)
1074 idctcol10(X + i);
1075
1076 for (i = 0; i < TX_DIM; i++) {
1077 for (j = 0; j < TX_DIM; j++) {
1078 block[i * shortpitch + j] = X[i * TX_DIM + j] >> 1;
1079 }
1080 }
1081 }
1082
/* 2x2 inverse Haar transform.
 *
 * Reads input[0], input[1], input[4] and input[8] and writes the four
 * results to the same positions of output. All 16 entries of output are
 * cleared first, so the other positions end up zero. pitch is not used.
 */
void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) {
  int k = 16;

  (void)pitch;

  while (k-- > 0)
    output[k] = 0;

  /* only the first result carries a +1 rounding term */
  output[0] = (input[0] + input[1] + input[4] + input[8] + 1) >> 1;
  output[1] = (input[0] - input[1] + input[4] - input[8]) >> 1;
  output[4] = (input[0] + input[1] - input[4] - input[8]) >> 1;
  output[8] = (input[0] - input[1] - input[4] + input[8]) >> 1;
}
1096
1097
#if 0
// Keep a really bad float version as reference for now.
// Direct evaluation of the 16x16 inverse DCT: every output sample is a sum
// over all 256 input coefficients.
void vp9_short_idct16x16_c(short *input, short *output, int pitch) {

  vp9_clear_system_state(); // Make it simd safe : __asm emms;
  {
    const int short_pitch = pitch >> 1;
    int col, row, u, v;
    for (col = 0; col < 16; ++col) {
      for (row = 0; row < 16; ++row) {
        double sum = 0;
        for (u = 0; u < 16; ++u) {
          for (v = 0; v < 16; ++v) {
            double term =
                cos(PI*v*(col+0.5)/16.0)*cos(PI*u*(row+0.5)/16.0)*input[u*16+v]/32;
            // every non-DC basis function gets an extra sqrt(2)
            if (u != 0)
              term *= sqrt(2.0);
            if (v != 0)
              term *= sqrt(2.0);
            sum += term;
          }
        }
        output[row*short_pitch+col] = (short)round(sum);
      }
    }
  }
  vp9_clear_system_state(); // Make it simd safe : __asm emms;
}
#endif
1127
1128 #define TEST_INT_16x16_IDCT 1
1129 #if !TEST_INT_16x16_IDCT
/* Cn = cos(n * pi / 32), n = 1..15 */
static const double C1  = 0.995184726672197;  /* cos( 1 * pi / 32) */
static const double C2  = 0.98078528040323;   /* cos( 2 * pi / 32) */
static const double C3  = 0.956940335732209;  /* cos( 3 * pi / 32) */
static const double C4  = 0.923879532511287;  /* cos( 4 * pi / 32) */
static const double C5  = 0.881921264348355;  /* cos( 5 * pi / 32) */
static const double C6  = 0.831469612302545;  /* cos( 6 * pi / 32) */
static const double C7  = 0.773010453362737;  /* cos( 7 * pi / 32) */
static const double C8  = 0.707106781186548;  /* cos( 8 * pi / 32) */
static const double C9  = 0.634393284163646;  /* cos( 9 * pi / 32) */
static const double C10 = 0.555570233019602;  /* cos(10 * pi / 32) */
static const double C11 = 0.471396736825998;  /* cos(11 * pi / 32) */
static const double C12 = 0.38268343236509;   /* cos(12 * pi / 32) */
static const double C13 = 0.290284677254462;  /* cos(13 * pi / 32) */
static const double C14 = 0.195090322016128;  /* cos(14 * pi / 32) */
static const double C15 = 0.098017140329561;  /* cos(15 * pi / 32) */
1145
1146
1147 static void butterfly_16x16_idct_1d(double input[16], double output[16]) {
1148
1149 vp9_clear_system_state(); // Make it simd safe : __asm emms;
1150 {
1151 double step[16];
1152 double intermediate[16];
1153 double temp1, temp2;
1154
1155
1156 // step 1 and 2
1157 step[ 0] = input[0] + input[8];
1158 step[ 1] = input[0] - input[8];
1159
1160 temp1 = input[4]*C12;
1161 temp2 = input[12]*C4;
1162
1163 temp1 -= temp2;
1164 temp1 *= C8;
1165
1166 step[ 2] = 2*(temp1);
1167
1168 temp1 = input[4]*C4;
1169 temp2 = input[12]*C12;
1170 temp1 += temp2;
1171 temp1 = (temp1);
1172 temp1 *= C8;
1173 step[ 3] = 2*(temp1);
1174
1175 temp1 = input[2]*C8;
1176 temp1 = 2*(temp1);
1177 temp2 = input[6] + input[10];
1178
1179 step[ 4] = temp1 + temp2;
1180 step[ 5] = temp1 - temp2;
1181
1182 temp1 = input[14]*C8;
1183 temp1 = 2*(temp1);
1184 temp2 = input[6] - input[10];
1185
1186 step[ 6] = temp2 - temp1;
1187 step[ 7] = temp2 + temp1;
1188
1189 // for odd input
1190 temp1 = input[3]*C12;
1191 temp2 = input[13]*C4;
1192 temp1 += temp2;
1193 temp1 = (temp1);
1194 temp1 *= C8;
1195 intermediate[ 8] = 2*(temp1);
1196
1197 temp1 = input[3]*C4;
1198 temp2 = input[13]*C12;
1199 temp2 -= temp1;
1200 temp2 = (temp2);
1201 temp2 *= C8;
1202 intermediate[ 9] = 2*(temp2);
1203
1204 intermediate[10] = 2*(input[9]*C8);
1205 intermediate[11] = input[15] - input[1];
1206 intermediate[12] = input[15] + input[1];
1207 intermediate[13] = 2*((input[7]*C8));
1208
1209 temp1 = input[11]*C12;
1210 temp2 = input[5]*C4;
1211 temp2 -= temp1;
1212 temp2 = (temp2);
1213 temp2 *= C8;
1214 intermediate[14] = 2*(temp2);
1215
1216 temp1 = input[11]*C4;
1217 temp2 = input[5]*C12;
1218 temp1 += temp2;
1219 temp1 = (temp1);
1220 temp1 *= C8;
1221 intermediate[15] = 2*(temp1);
1222
1223 step[ 8] = intermediate[ 8] + intermediate[14];
1224 step[ 9] = intermediate[ 9] + intermediate[15];
1225 step[10] = intermediate[10] + intermediate[11];
1226 step[11] = intermediate[10] - intermediate[11];
1227 step[12] = intermediate[12] + intermediate[13];
1228 step[13] = intermediate[12] - intermediate[13];
1229 step[14] = intermediate[ 8] - intermediate[14];
1230 step[15] = intermediate[ 9] - intermediate[15];
1231
1232 // step 3
1233 output[0] = step[ 0] + step[ 3];
1234 output[1] = step[ 1] + step[ 2];
1235 output[2] = step[ 1] - step[ 2];
1236 output[3] = step[ 0] - step[ 3];
1237
1238 temp1 = step[ 4]*C14;
1239 temp2 = step[ 7]*C2;
1240 temp1 -= temp2;
1241 output[4] = (temp1);
1242
1243 temp1 = step[ 4]*C2;
1244 temp2 = step[ 7]*C14;
1245 temp1 += temp2;
1246 output[7] = (temp1);
1247
1248 temp1 = step[ 5]*C10;
1249 temp2 = step[ 6]*C6;
1250 temp1 -= temp2;
1251 output[5] = (temp1);
1252
1253 temp1 = step[ 5]*C6;
1254 temp2 = step[ 6]*C10;
1255 temp1 += temp2;
1256 output[6] = (temp1);
1257
1258 output[8] = step[ 8] + step[11];
1259 output[9] = step[ 9] + step[10];
1260 output[10] = step[ 9] - step[10];
1261 output[11] = step[ 8] - step[11];
1262 output[12] = step[12] + step[15];
1263 output[13] = step[13] + step[14];
1264 output[14] = step[13] - step[14];
1265 output[15] = step[12] - step[15];
1266
1267 // output 4
1268 step[ 0] = output[0] + output[7];
1269 step[ 1] = output[1] + output[6];
1270 step[ 2] = output[2] + output[5];
1271 step[ 3] = output[3] + output[4];
1272 step[ 4] = output[3] - output[4];
1273 step[ 5] = output[2] - output[5];
1274 step[ 6] = output[1] - output[6];
1275 step[ 7] = output[0] - output[7];
1276
1277 temp1 = output[8]*C7;
1278 temp2 = output[15]*C9;
1279 temp1 -= temp2;
1280 step[ 8] = (temp1);
1281
1282 temp1 = output[9]*C11;
1283 temp2 = output[14]*C5;
1284 temp1 += temp2;
1285 step[ 9] = (temp1);
1286
1287 temp1 = output[10]*C3;
1288 temp2 = output[13]*C13;
1289 temp1 -= temp2;
1290 step[10] = (temp1);
1291
1292 temp1 = output[11]*C15;
1293 temp2 = output[12]*C1;
1294 temp1 += temp2;
1295 step[11] = (temp1);
1296
1297 temp1 = output[11]*C1;
1298 temp2 = output[12]*C15;
1299 temp2 -= temp1;
1300 step[12] = (temp2);
1301
1302 temp1 = output[10]*C13;
1303 temp2 = output[13]*C3;
1304 temp1 += temp2;
1305 step[13] = (temp1);
1306
1307 temp1 = output[9]*C5;
1308 temp2 = output[14]*C11;
1309 temp2 -= temp1;
1310 step[14] = (temp2);
1311
1312 temp1 = output[8]*C9;
1313 temp2 = output[15]*C7;
1314 temp1 += temp2;
1315 step[15] = (temp1);
1316
1317 // step 5
1318 output[0] = (step[0] + step[15]);
1319 output[1] = (step[1] + step[14]);
1320 output[2] = (step[2] + step[13]);
1321 output[3] = (step[3] + step[12]);
1322 output[4] = (step[4] + step[11]);
1323 output[5] = (step[5] + step[10]);
1324 output[6] = (step[6] + step[ 9]);
1325 output[7] = (step[7] + step[ 8]);
1326
1327 output[15] = (step[0] - step[15]);
1328 output[14] = (step[1] - step[14]);
1329 output[13] = (step[2] - step[13]);
1330 output[12] = (step[3] - step[12]);
1331 output[11] = (step[4] - step[11]);
1332 output[10] = (step[5] - step[10]);
1333 output[9] = (step[6] - step[ 9]);
1334 output[8] = (step[7] - step[ 8]);
1335 }
1336 vp9_clear_system_state(); // Make it simd safe : __asm emms;
1337 }
1338
// Remove once an int version of iDCT is written
#if 0
// Direct-sum 16-point inverse DCT; the n == 0 term is scaled by 1/sqrt(2).
void reference_16x16_idct_1d(double input[16], double output[16]) {

  vp9_clear_system_state(); // Make it simd safe : __asm emms;
  {
    const double kPi = 3.141592653589793238462643383279502884;
    const double kSqrt2 = 1.414213562373095048801688724209698;
    int k, n;
    for (k = 0; k < 16; k++) {
      // n == 0 term, then its 1/sqrt(2) scaling
      output[k] = 0.0;
      output[k] += input[0]*cos(kPi*(2*k+1)*0/32.0);
      output[k] = output[k]/kSqrt2;
      // remaining terms
      for (n = 1; n < 16; n++)
        output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0);
    }
  }
  vp9_clear_system_state(); // Make it simd safe : __asm emms;
}
#endif
1359
/* Floating-point 16x16 inverse DCT built from two passes of the 1-D
 * butterfly (rows first, then columns).
 *
 * input:  coefficients; row i starts at input[i * (pitch >> 1)].
 * output: 256 results written contiguously with a row stride of 16; each
 *         value is round(x / 128).
 * This variant is only compiled when TEST_INT_16x16_IDCT is 0.
 */
void vp9_short_idct16x16_c(short *input, short *output, int pitch) {

  vp9_clear_system_state(); // Make it simd safe : __asm emms;
  {
    double row_pass[16*16], col_pass[16*16];
    double line_in[16], line_out[16];
    const int short_pitch = pitch >> 1;
    int r, c, k;

    // First transform rows
    for (r = 0; r < 16; ++r) {
      for (c = 0; c < 16; ++c)
        line_in[c] = input[c + r*short_pitch];
      butterfly_16x16_idct_1d(line_in, line_out);
      for (c = 0; c < 16; ++c)
        row_pass[c + r*16] = line_out[c];
    }

    // Then transform columns
    for (c = 0; c < 16; ++c) {
      for (r = 0; r < 16; ++r)
        line_in[r] = row_pass[r*16 + c];
      butterfly_16x16_idct_1d(line_in, line_out);
      for (r = 0; r < 16; ++r)
        col_pass[r*16 + c] = line_out[r];
    }

    // Scale down and round to the nearest integer
    for (k = 0; k < 16*16; ++k)
      output[k] = round(col_pass[k]/128);
  }
  vp9_clear_system_state(); // Make it simd safe : __asm emms;
}
1390
1391 #else
/* Cn = round(cos(n * pi / 32) * 2^14), n = 1..15 */
static const int16_t C1  = 16305;  /* cos( 1 * pi / 32) */
static const int16_t C2  = 16069;  /* cos( 2 * pi / 32) */
static const int16_t C3  = 15679;  /* cos( 3 * pi / 32) */
static const int16_t C4  = 15137;  /* cos( 4 * pi / 32) */
static const int16_t C5  = 14449;  /* cos( 5 * pi / 32) */
static const int16_t C6  = 13623;  /* cos( 6 * pi / 32) */
static const int16_t C7  = 12665;  /* cos( 7 * pi / 32) */
static const int16_t C8  = 11585;  /* cos( 8 * pi / 32) */
static const int16_t C9  = 10394;  /* cos( 9 * pi / 32) */
static const int16_t C10 = 9102;   /* cos(10 * pi / 32) */
static const int16_t C11 = 7723;   /* cos(11 * pi / 32) */
static const int16_t C12 = 6270;   /* cos(12 * pi / 32) */
static const int16_t C13 = 4756;   /* cos(13 * pi / 32) */
static const int16_t C14 = 3196;   /* cos(14 * pi / 32) */
static const int16_t C15 = 1606;   /* cos(15 * pi / 32) */

/* Shift applied to plain sums in the first butterfly stages, and the
 * matching round-half-up offset. */
#define INITIAL_SHIFT 2
#define INITIAL_ROUNDING (1 << (INITIAL_SHIFT - 1))
/* Shift that removes the 2^14 scale of a Cn product, and the matching
 * round-half-up offset. */
#define RIGHT_SHIFT 14
#define RIGHT_ROUNDING (1 << (RIGHT_SHIFT - 1))
1412
1413 static void butterfly_16x16_idct_1d(int16_t input[16], int16_t output[16],
1414 int last_shift_bits) {
1415 int16_t step[16];
1416 int intermediate[16];
1417 int temp1, temp2;
1418
1419 int step1_shift = RIGHT_SHIFT + INITIAL_SHIFT;
1420 int step1_rounding = 1 << (step1_shift - 1);
1421 int last_rounding = 0;
1422
1423 if (last_shift_bits > 0)
1424 last_rounding = 1 << (last_shift_bits - 1);
1425
1426 // step 1 and 2
1427 step[ 0] = (input[0] + input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1428 step[ 1] = (input[0] - input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1429
1430 temp1 = input[4] * C12;
1431 temp2 = input[12] * C4;
1432 temp1 = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1433 temp1 *= C8;
1434 step[ 2] = (2 * (temp1) + step1_rounding) >> step1_shift;
1435
1436 temp1 = input[4] * C4;
1437 temp2 = input[12] * C12;
1438 temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1439 temp1 *= C8;
1440 step[ 3] = (2 * (temp1) + step1_rounding) >> step1_shift;
1441
1442 temp1 = input[2] * C8;
1443 temp1 = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1444 temp2 = input[6] + input[10];
1445 step[ 4] = (temp1 + temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1446 step[ 5] = (temp1 - temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1447
1448 temp1 = input[14] * C8;
1449 temp1 = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1450 temp2 = input[6] - input[10];
1451 step[ 6] = (temp2 - temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1452 step[ 7] = (temp2 + temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1453
1454 // for odd input
1455 temp1 = input[3] * C12;
1456 temp2 = input[13] * C4;
1457 temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1458 temp1 *= C8;
1459 intermediate[ 8] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1460
1461 temp1 = input[3] * C4;
1462 temp2 = input[13] * C12;
1463 temp2 = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1464 temp2 *= C8;
1465 intermediate[ 9] = (2 * (temp2) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1466
1467 intermediate[10] = (2 * (input[9] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1468 intermediate[11] = input[15] - input[1];
1469 intermediate[12] = input[15] + input[1];
1470 intermediate[13] = (2 * (input[7] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1471
1472 temp1 = input[11] * C12;
1473 temp2 = input[5] * C4;
1474 temp2 = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1475 temp2 *= C8;
1476 intermediate[14] = (2 * (temp2) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1477
1478 temp1 = input[11] * C4;
1479 temp2 = input[5] * C12;
1480 temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1481 temp1 *= C8;
1482 intermediate[15] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1483
1484 step[ 8] = (intermediate[ 8] + intermediate[14] + INITIAL_ROUNDING)
1485 >> INITIAL_SHIFT;
1486 step[ 9] = (intermediate[ 9] + intermediate[15] + INITIAL_ROUNDING)
1487 >> INITIAL_SHIFT;
1488 step[10] = (intermediate[10] + intermediate[11] + INITIAL_ROUNDING)
1489 >> INITIAL_SHIFT;
1490 step[11] = (intermediate[10] - intermediate[11] + INITIAL_ROUNDING)
1491 >> INITIAL_SHIFT;
1492 step[12] = (intermediate[12] + intermediate[13] + INITIAL_ROUNDING)
1493 >> INITIAL_SHIFT;
1494 step[13] = (intermediate[12] - intermediate[13] + INITIAL_ROUNDING)
1495 >> INITIAL_SHIFT;
1496 step[14] = (intermediate[ 8] - intermediate[14] + INITIAL_ROUNDING)
1497 >> INITIAL_SHIFT;
1498 step[15] = (intermediate[ 9] - intermediate[15] + INITIAL_ROUNDING)
1499 >> INITIAL_SHIFT;
1500
1501 // step 3
1502 output[0] = step[ 0] + step[ 3];
1503 output[1] = step[ 1] + step[ 2];
1504 output[2] = step[ 1] - step[ 2];
1505 output[3] = step[ 0] - step[ 3];
1506
1507 temp1 = step[ 4] * C14;
1508 temp2 = step[ 7] * C2;
1509 output[4] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1510
1511 temp1 = step[ 4] * C2;
1512 temp2 = step[ 7] * C14;
1513 output[7] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1514
1515 temp1 = step[ 5] * C10;
1516 temp2 = step[ 6] * C6;
1517 output[5] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1518
1519 temp1 = step[ 5] * C6;
1520 temp2 = step[ 6] * C10;
1521 output[6] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1522
1523 output[8] = step[ 8] + step[11];
1524 output[9] = step[ 9] + step[10];
1525 output[10] = step[ 9] - step[10];
1526 output[11] = step[ 8] - step[11];
1527 output[12] = step[12] + step[15];
1528 output[13] = step[13] + step[14];
1529 output[14] = step[13] - step[14];
1530 output[15] = step[12] - step[15];
1531
1532 // output 4
1533 step[ 0] = output[0] + output[7];
1534 step[ 1] = output[1] + output[6];
1535 step[ 2] = output[2] + output[5];
1536 step[ 3] = output[3] + output[4];
1537 step[ 4] = output[3] - output[4];
1538 step[ 5] = output[2] - output[5];
1539 step[ 6] = output[1] - output[6];
1540 step[ 7] = output[0] - output[7];
1541
1542 temp1 = output[8] * C7;
1543 temp2 = output[15] * C9;
1544 step[ 8] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1545
1546 temp1 = output[9] * C11;
1547 temp2 = output[14] * C5;
1548 step[ 9] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1549
1550 temp1 = output[10] * C3;
1551 temp2 = output[13] * C13;
1552 step[10] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1553
1554 temp1 = output[11] * C15;
1555 temp2 = output[12] * C1;
1556 step[11] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1557
1558 temp1 = output[11] * C1;
1559 temp2 = output[12] * C15;
1560 step[12] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1561
1562 temp1 = output[10] * C13;
1563 temp2 = output[13] * C3;
1564 step[13] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1565
1566 temp1 = output[9] * C5;
1567 temp2 = output[14] * C11;
1568 step[14] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1569
1570 temp1 = output[8] * C9;
1571 temp2 = output[15] * C7;
1572 step[15] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1573
1574 // step 5
1575 output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits;
1576 output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits;
1577 output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits;
1578 output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits;
1579 output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits;
1580 output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits;
1581 output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits;
1582 output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits;
1583
1584 output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits;
1585 output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits;
1586 output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits;
1587 output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits;
1588 output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits;
1589 output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits;
1590 output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits;
1591 output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits;
1592 }
1593
/* Fixed-point 16x16 inverse DCT: a row pass followed by a column pass of
 * the 1-D butterfly.
 *
 * input:  coefficients; row i starts at input[i * (pitch >> 1)].
 * output: results, written with a fixed row stride of 16.
 * pitch:  halved to obtain the input row stride in int16_t units.
 */
void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
  const int in_stride = pitch >> 1;
  int16_t row_pass[16 * 16];
  int16_t col_in[16], col_out[16];
  int r, c;

  // First transform rows (no final scaling)
  for (r = 0; r < 16; ++r)
    butterfly_16x16_idct_1d(input + r * in_stride, row_pass + r * 16, 0);

  // Then transform columns (final rounding shift by 3)
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r)
      col_in[r] = row_pass[r * 16 + c];
    butterfly_16x16_idct_1d(col_in, col_out, 3);
    for (r = 0; r < 16; ++r)
      output[r * 16 + c] = col_out[r];
  }
}
1617
1618 /* The following function is called when we know the maximum number of non-zero
1619 * dct coefficients is less or equal 10.
1620 */
1621 static void butterfly_16x16_idct10_1d(int16_t input[16], int16_t output[16],
1622 int last_shift_bits) {
1623 int16_t step[16] = {0};
1624 int intermediate[16] = {0};
1625 int temp1, temp2;
1626 int last_rounding = 0;
1627
1628 if (last_shift_bits > 0)
1629 last_rounding = 1 << (last_shift_bits - 1);
1630
1631 // step 1 and 2
1632 step[ 0] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1633 step[ 1] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1634
1635 temp1 = (2 * (input[2] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1636 step[ 4] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1637 step[ 5] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1638
1639 // for odd input
1640 temp1 = (input[3] * C12 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1641 temp1 *= C8;
1642 intermediate[ 8] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1643
1644 temp1 = (-input[3] * C4 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1645 temp1 *= C8;
1646 intermediate[ 9] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1647
1648 step[ 8] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1649 step[ 9] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1650 step[10] = (-input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1651 step[11] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1652 step[12] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1653 step[13] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1654 step[14] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1655 step[15] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1656
1657 // step 3
1658 output[0] = step[ 0];
1659 output[1] = step[ 1];
1660 output[2] = step[ 1];
1661 output[3] = step[ 0];
1662
1663 temp1 = step[ 4] * C14;
1664 output[4] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1665
1666 temp1 = step[ 4] * C2;
1667 output[7] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1668
1669 temp1 = step[ 5] * C10;
1670 output[5] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1671
1672 temp1 = step[ 5] * C6;
1673 output[6] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1674
1675 output[8] = step[ 8] + step[11];
1676 output[9] = step[ 9] + step[10];
1677 output[10] = step[ 9] - step[10];
1678 output[11] = step[ 8] - step[11];
1679 output[12] = step[12] + step[15];
1680 output[13] = step[13] + step[14];
1681 output[14] = step[13] - step[14];
1682 output[15] = step[12] - step[15];
1683
1684 // output 4
1685 step[ 0] = output[0] + output[7];
1686 step[ 1] = output[1] + output[6];
1687 step[ 2] = output[2] + output[5];
1688 step[ 3] = output[3] + output[4];
1689 step[ 4] = output[3] - output[4];
1690 step[ 5] = output[2] - output[5];
1691 step[ 6] = output[1] - output[6];
1692 step[ 7] = output[0] - output[7];
1693
1694 temp1 = output[8] * C7;
1695 temp2 = output[15] * C9;
1696 step[ 8] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1697
1698 temp1 = output[9] * C11;
1699 temp2 = output[14] * C5;
1700 step[ 9] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1701
1702 temp1 = output[10] * C3;
1703 temp2 = output[13] * C13;
1704 step[10] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1705
1706 temp1 = output[11] * C15;
1707 temp2 = output[12] * C1;
1708 step[11] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1709
1710 temp1 = output[11] * C1;
1711 temp2 = output[12] * C15;
1712 step[12] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1713
1714 temp1 = output[10] * C13;
1715 temp2 = output[13] * C3;
1716 step[13] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1717
1718 temp1 = output[9] * C5;
1719 temp2 = output[14] * C11;
1720 step[14] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1721
1722 temp1 = output[8] * C9;
1723 temp2 = output[15] * C7;
1724 step[15] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1725
1726 // step 5
1727 output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits;
1728 output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits;
1729 output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits;
1730 output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits;
1731 output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits;
1732 output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits;
1733 output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits;
1734 output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits;
1735
1736 output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits;
1737 output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits;
1738 output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits;
1739 output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits;
1740 output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits;
1741 output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits;
1742 output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits;
1743 output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits;
1744 }
1745
/* Fixed-point 16x16 inverse DCT for blocks whose non-zero coefficients all
 * lie in the upper-left 4x4 area.
 *
 * input:  coefficients; row i starts at input[i * (pitch >> 1)].
 * output: results, written with a fixed row stride of 16.
 * pitch:  halved to obtain the input row stride in int16_t units.
 */
void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
  const int in_stride = pitch >> 1;
  int16_t row_pass[16 * 16];
  int16_t col_in[16], col_out[16];
  int r, c;

  /* First transform rows. Since all non-zero dct coefficients are in
   * upper-left 4x4 area, we only need to calculate first 4 rows here;
   * the other rows of the intermediate buffer stay zero.
   */
  vpx_memset(row_pass, 0, sizeof(row_pass));
  for (r = 0; r < 4; ++r)
    butterfly_16x16_idct10_1d(input + r * in_stride, row_pass + r * 16, 0);

  // Then transform columns (final rounding shift by 3)
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r)
      col_in[r] = row_pass[r * 16 + c];
    butterfly_16x16_idct10_1d(col_in, col_out, 3);
    for (r = 0; r < 16; ++r)
      output[r * 16 + c] = col_out[r];
  }
}
1772 #undef INITIAL_SHIFT
1773 #undef INITIAL_ROUNDING
1774 #undef RIGHT_SHIFT
1775 #undef RIGHT_ROUNDING
1776 #endif
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698