OLD | NEW |
| (Empty) |
1 ; | |
2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved. | |
3 ; | |
4 ; Use of this source code is governed by a BSD-style license | |
5 ; that can be found in the LICENSE file in the root of the source | |
6 ; tree. An additional intellectual property rights grant can be found | |
7 ; in the file PATENTS. All contributing project authors may | |
8 ; be found in the AUTHORS file in the root of the source tree. | |
9 ; | |
10 %include "third_party/x86inc/x86inc.asm" | |
11 | |
; This file provides the SSSE3 version of the inverse transform. Some
; of the functions were originally derived from the FFmpeg project.
; Note that the current version applies to x86 64-bit only.
15 | |
SECTION_RODATA

; Broadcast constants loaded into registers by the functions in this file.
pw_11585x2: times 8 dw 23170   ; 2 * 11585; used as the pmulhrsw multiplier
pd_8192:    times 4 dd 8192    ; 1 << 13; rounding term added before psrad 14
pw_16:      times 8 dw 16      ; 1 << 4; rounding term added before psraw 5
21 | |
; TRANSFORM_COEFFS c1, c2
; Emits two 8-word tables of interleaved coefficient pairs for pmaddwd:
;   pw_<c1>_<c2>:   { c1, c2 } repeated four times
;   pw_m<c2>_<c1>:  { -c2, c1 } repeated four times
; BUTTERFLY_4X looks these tables up by name, so every (coef1, coef2) pair
; passed to BUTTERFLY_4X must be instantiated here.
%macro TRANSFORM_COEFFS 2
pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
pw_m%2_%1:  dw -%2,  %1, -%2,  %1, -%2,  %1, -%2,  %1
%endmacro

TRANSFORM_COEFFS  6270, 15137
TRANSFORM_COEFFS  3196, 16069
TRANSFORM_COEFFS 13623,  9102
30 | |
; PAIR_xx_COEFFS a, b
; Each macro emits one 8-word table whose low four words carry the first
; value and whose high four words carry the second value. The two letters
; give the signs applied to (a, b): P = as given, M = negated. The label
; encodes both values, with an 'm' prefix on each negated one.

; dpw_<a>_<b>: { a, a, a, a, b, b, b, b }
%macro PAIR_PP_COEFFS 2
dpw_%1_%2:    dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2
%endmacro

; dpw_m<a>_<b>: { -a, -a, -a, -a, b, b, b, b }
%macro PAIR_MP_COEFFS 2
dpw_m%1_%2:   dw -%1, -%1, -%1, -%1,  %2,  %2,  %2,  %2
%endmacro

; dpw_m<a>_m<b>: { -a, -a, -a, -a, -b, -b, -b, -b }
%macro PAIR_MM_COEFFS 2
dpw_m%1_m%2:  dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
%endmacro
42 | |
; Tables with a different value in each 64-bit half. The values are twice
; the TRANSFORM_COEFFS constants (e.g. 30274 = 2 * 15137), and the tables
; are used as pmulhrsw operands in idct8x8_12_add.
; Labels: dpw_30274_12540, dpw_6392_32138, dpw_m18204_27246.
PAIR_PP_COEFFS 30274, 12540
PAIR_PP_COEFFS  6392, 32138
PAIR_MP_COEFFS 18204, 27246

; Tables with the same value in all eight words.
; Labels: dpw_12540_12540, dpw_30274_30274, dpw_6392_6392, dpw_32138_32138,
;         dpw_m18204_m18204, dpw_27246_27246.
PAIR_PP_COEFFS 12540, 12540
PAIR_PP_COEFFS 30274, 30274
PAIR_PP_COEFFS  6392,  6392
PAIR_PP_COEFFS 32138, 32138
PAIR_MM_COEFFS 18204, 18204
PAIR_PP_COEFFS 27246, 27246
53 | |
54 SECTION .text | |
55 | |
56 %if ARCH_X86_64 | |
; SUM_SUB a, b, tmp  (arguments are register numbers)
; On exit:
;   m<a> = a + b   (packed words, wrapping)
;   m<b> = a - b   (packed words, wrapping)
; The difference is computed into m<tmp> and SWAP then exchanges the names
; of <b> and <tmp>, so afterwards m<tmp> refers to the register that still
; holds the original b. Treat m<tmp> as clobbered.
%macro SUM_SUB 3
  psubw           m%3, m%1, m%2
  paddw           m%1, m%2
  SWAP             %2, %3
%endmacro
62 | |
; butterfly operation
; MUL_ADD_2X dst1, dst2, src, round, coefs1, coefs2
;   dst1, dst2, src : register numbers
;   round           : full operand (e.g. m8) added to every dword
;   coefs1, coefs2  : full operands (e.g. [pw_...]) of packed word pairs
; Computes, per dword lane:
;   m<dst1> = (pmaddwd(m<src>, coefs1) + round) >> 14   (arithmetic shift)
;   m<dst2> = (pmaddwd(m<src>, coefs2) + round) >> 14
; dst2 may be the same register as src (src is read by both multiplies
; before dst2 is overwritten); dst1 must differ from src.
%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
  pmaddwd         m%1, m%3, %5
  pmaddwd         m%2, m%3, %6
  paddd           m%1, %4
  paddd           m%2, %4
  psrad           m%1, 14
  psrad           m%2, 14
%endmacro
72 | |
; BUTTERFLY_4X dst1, dst2, coef1, coef2, round, tmp1, tmp2
;   dst1, dst2   : register numbers; inputs a = m<dst1>, b = m<dst2>
;   coef1, coef2 : literal coefficients; the tables pw_<coef1>_<coef2> and
;                  pw_m<coef2>_<coef1> must exist
;   round        : full operand holding the dword rounding term
;   tmp1, tmp2   : scratch register numbers (clobbered)
; For each of the eight word lanes:
;   m<dst1> = sat16((a * coef1 - b * coef2 + round) >> 14)
;   m<dst2> = sat16((a * coef2 + b * coef1 + round) >> 14)
; The words of b and a are interleaved so that one pmaddwd yields each
; two-term sum; high and low halves are processed separately and then
; repacked with signed saturation.
%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
  punpckhwd       m%6, m%2, m%1                  ; high four (b, a) pairs
  MUL_ADD_2X       %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_%3_%4]
  punpcklwd       m%2, m%1                       ; low four (b, a) pairs
  MUL_ADD_2X       %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_%3_%4]
  packssdw        m%1, m%7
  packssdw        m%2, m%6
%endmacro
81 | |
; matrix transpose building block
; INTERLEAVE_2X size, a, b, tmp
;   size      : punpck suffix selecting the element width (wd, dq or qdq)
;   a, b, tmp : register numbers
; On exit:
;   m<a> = punpckl<size>(a, b)   (interleaved low halves)
;   m<b> = punpckh<size>(a, b)   (interleaved high halves)
; The high interleave is built in m<tmp>, then SWAP exchanges the names of
; <b> and <tmp>; m<tmp> ends up naming the register with the original b and
; should be treated as clobbered.
%macro INTERLEAVE_2X 4
  punpckh%1       m%4, m%2, m%3
  punpckl%1       m%2, m%3
  SWAP             %3, %4
%endmacro
88 | |
; TRANSPOSE8X8 r0, r1, r2, r3, r4, r5, r6, r7, tmp
; Transposes an 8x8 matrix of words held one row per register in
; m<r0>..m<r7>; the result is left in the same eight register names.
; m<tmp> is scratch. Three interleave rounds are applied at word, dword and
; qword granularity, followed by two SWAPs that put the rows back in order.
%macro TRANSPOSE8X8 9
  ; round 1: interleave words of adjacent rows
  INTERLEAVE_2X  wd, %1, %2, %9
  INTERLEAVE_2X  wd, %3, %4, %9
  INTERLEAVE_2X  wd, %5, %6, %9
  INTERLEAVE_2X  wd, %7, %8, %9

  ; round 2: interleave dwords
  INTERLEAVE_2X  dq, %1, %3, %9
  INTERLEAVE_2X  dq, %2, %4, %9
  INTERLEAVE_2X  dq, %5, %7, %9
  INTERLEAVE_2X  dq, %6, %8, %9

  ; round 3: interleave qwords
  INTERLEAVE_2X  qdq, %1, %5, %9
  INTERLEAVE_2X  qdq, %3, %7, %9
  INTERLEAVE_2X  qdq, %2, %6, %9
  INTERLEAVE_2X  qdq, %4, %8, %9

  ; fix up the row order (register renaming only)
  SWAP  %2, %5
  SWAP  %4, %7
%endmacro
108 | |
; IDCT8_1D
; One 1-D pass of the 8-point inverse transform, applied in place to the
; eight word vectors m0..m7.
; Register contract (set up by the caller before invoking the macro):
;   m8  : dword rounding term for BUTTERFLY_4X (callers load pd_8192)
;   m12 : pmulhrsw multiplier (callers load pw_11585x2); since
;         23170 = 2 * 11585, pmulhrsw yields (x * 11585 + (1 << 13)) >> 14
;   m9, m10 : scratch, clobbered
%macro IDCT8_1D 0
  SUM_SUB          0,  4,  9
  BUTTERFLY_4X     2,  6,  6270, 15137, m8, 9, 10
  pmulhrsw        m0, m12
  pmulhrsw        m4, m12
  BUTTERFLY_4X     1,  7,  3196, 16069, m8, 9, 10
  BUTTERFLY_4X     5,  3, 13623,  9102, m8, 9, 10

  SUM_SUB          1,  5,  9
  SUM_SUB          7,  3,  9
  SUM_SUB          0,  6,  9
  SUM_SUB          4,  2,  9
  SUM_SUB          3,  5,  9
  pmulhrsw        m3, m12
  pmulhrsw        m5, m12

  SUM_SUB          0,  7,  9
  SUM_SUB          4,  3,  9
  SUM_SUB          2,  5,  9
  SUM_SUB          6,  1,  9

  ; rename registers so the results sit in m0..m7 in output order
  SWAP             3,  6
  SWAP             1,  4
%endmacro
133 | |
; This macro handles 8 pixels per line, two lines per invocation.
; ADD_STORE_8P_2X src1, src2, tmp1, tmp2, zero  (all register numbers)
;   src1, src2 : word vectors for the lines at [outputq] and
;                [outputq + strideq]; modified in place
;   tmp1, tmp2 : scratch registers (clobbered)
;   zero       : register that must contain all zeros
; Requires m11 to hold the word rounding term (callers load pw_16).
; Each src is rounded and shifted, (src + m11) >> 5, then added to the eight
; destination bytes of its line; the sum is clamped to 0..255 by packuswb
; and written back. outputq itself is not advanced.
%macro ADD_STORE_8P_2X 5 ; src1, src2, tmp1, tmp2, zero
  paddw           m%1, m11
  paddw           m%2, m11
  psraw           m%1, 5
  psraw           m%2, 5

  movh            m%3, [outputq]
  movh            m%4, [outputq + strideq]
  punpcklbw       m%3, m%5                       ; widen bytes to words
  punpcklbw       m%4, m%5
  paddw           m%3, m%1
  paddw           m%4, m%2
  packuswb        m%3, m%5                       ; saturate back to bytes
  packuswb        m%4, m%5
  movh            [outputq], m%3
  movh            [outputq + strideq], m%4
%endmacro
152 | |
INIT_XMM ssse3
; full inverse 8x8 2D-DCT transform
;
; Arguments (named through cglobal):
;   input  : 64 words of coefficients, read as eight 16-byte rows with mova
;            (presumably must be 16-byte aligned -- confirm against callers)
;   output : destination pixels; eight lines of eight bytes are read,
;            updated and written back
;   stride : distance in bytes between consecutive output lines
; Uses r3 as 2 * stride and m0..m12 (m8, m11, m12 hold constants; m9, m10
; are scratch).
; NOTE(review): strideq is used as a full 64-bit value and is never
; sign-extended here. If the C prototype declares stride as a 32-bit int,
; the upper half of the register is not guaranteed by the ABI -- confirm.
cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
  mova             m8, [pd_8192]                 ; rounding for BUTTERFLY_4X
  mova            m11, [pw_16]                   ; rounding for final >> 5
  mova            m12, [pw_11585x2]              ; pmulhrsw multiplier

  lea              r3, [2 * strideq]             ; two output lines per step

  mova             m0, [inputq +   0]
  mova             m1, [inputq +  16]
  mova             m2, [inputq +  32]
  mova             m3, [inputq +  48]
  mova             m4, [inputq +  64]
  mova             m5, [inputq +  80]
  mova             m6, [inputq +  96]
  mova             m7, [inputq + 112]

  ; two transpose + 1-D transform passes
  TRANSPOSE8X8      0, 1, 2, 3, 4, 5, 6, 7, 9
  IDCT8_1D
  TRANSPOSE8X8      0, 1, 2, 3, 4, 5, 6, 7, 9
  IDCT8_1D

  ; m12 is no longer needed as a multiplier; reuse it as the zero register
  pxor            m12, m12
  ADD_STORE_8P_2X   0, 1, 9, 10, 12
  lea         outputq, [outputq + r3]
  ADD_STORE_8P_2X   2, 3, 9, 10, 12
  lea         outputq, [outputq + r3]
  ADD_STORE_8P_2X   4, 5, 9, 10, 12
  lea         outputq, [outputq + r3]
  ADD_STORE_8P_2X   6, 7, 9, 10, 12

  RET
186 | |
; inverse 8x8 2D-DCT transform for sparse input: only the low four words of
; input rows 0..3 (the top-left 4x4 coefficients) are read; all other
; coefficients are treated as zero. Per the function name this is the
; variant for blocks with at most the first 12 coeffs non-zero.
;
; Arguments (named through cglobal):
;   input  : coefficient words; rows 0..3 are read with mova
;            (presumably must be 16-byte aligned -- confirm against callers)
;   output : destination pixels; eight lines of eight bytes are read,
;            updated and written back
;   stride : distance in bytes between consecutive output lines
; Uses r3 as 2 * stride and m0..m12 (m11, m12 hold constants; m9, m10 are
; scratch). m8 is loaded but not referenced again in this function.
; NOTE(review): strideq is used as a full 64-bit value and is never
; sign-extended here. If the C prototype declares stride as a 32-bit int,
; the upper half of the register is not guaranteed by the ABI -- confirm.
cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
  mova             m8, [pd_8192]
  mova            m11, [pw_16]                   ; rounding for final >> 5
  mova            m12, [pw_11585x2]              ; pmulhrsw multiplier

  lea              r3, [2 * strideq]             ; two output lines per step

  mova             m0, [inputq +  0]
  mova             m1, [inputq + 16]
  mova             m2, [inputq + 32]
  mova             m3, [inputq + 48]

  ; 4x4 transpose of the low four words of rows 0..3:
  ; m0 = { column 0, column 1 }, m2 = { column 2, column 3 }
  punpcklwd        m0, m1
  punpcklwd        m2, m3
  punpckhdq        m9, m0, m2
  punpckldq        m0, m2
  SWAP              2, 9

  ; duplicate each column into both 64-bit halves:
  ; m0 -> [0], [0]
  ; m1 -> [1], [1]
  ; m2 -> [2], [2]
  ; m3 -> [3], [3]
  punpckhqdq      m10, m0, m0
  punpcklqdq       m0, m0
  punpckhqdq       m9, m2, m2
  punpcklqdq       m2, m2
  SWAP              1, 10
  SWAP              3, 9

  ; first pass: the multiplies use tables with a different coefficient in
  ; each 64-bit half, so each register yields two different products
  pmulhrsw         m0, m12
  pmulhrsw         m2, [dpw_30274_12540]
  pmulhrsw         m1, [dpw_6392_32138]
  pmulhrsw         m3, [dpw_m18204_27246]

  SUM_SUB           0, 2, 9
  SUM_SUB           1, 3, 9

  punpcklqdq       m9, m3, m3
  punpckhqdq       m5, m3, m9

  SUM_SUB           3, 5, 9
  punpckhqdq       m5, m3
  pmulhrsw         m5, m12

  punpckhqdq       m9, m1, m5
  punpcklqdq       m1, m5
  SWAP              5, 9

  SUM_SUB           0, 5, 9
  SUM_SUB           2, 1, 9

  ; regroup the first-pass results into m0..m3 for the second pass
  ; (presumably the transpose between the two passes -- verify)
  punpckhqdq       m3, m0, m0
  punpckhqdq       m4, m1, m1
  punpckhqdq       m6, m5, m5
  punpckhqdq       m7, m2, m2

  punpcklwd        m0, m3
  punpcklwd        m7, m2
  punpcklwd        m1, m4
  punpcklwd        m6, m5

  punpckhdq        m4, m0, m7
  punpckldq        m0, m7
  punpckhdq       m10, m1, m6
  punpckldq        m5, m1, m6

  punpckhqdq       m1, m0, m5
  punpcklqdq       m0, m5
  punpckhqdq       m3, m4, m10
  punpcklqdq       m2, m4, m10

  ; second pass: full-width vectors, multiplies use the tables with the
  ; same coefficient in all eight words
  pmulhrsw         m0, m12
  pmulhrsw         m6, m2, [dpw_30274_30274]
  pmulhrsw         m4, m2, [dpw_12540_12540]

  pmulhrsw         m7, m1, [dpw_32138_32138]
  pmulhrsw         m1, [dpw_6392_6392]
  pmulhrsw         m5, m3, [dpw_m18204_m18204]
  pmulhrsw         m3, [dpw_27246_27246]

  mova             m2, m0
  SUM_SUB           0, 6, 9
  SUM_SUB           2, 4, 9
  SUM_SUB           1, 5, 9
  SUM_SUB           7, 3, 9

  SUM_SUB           3, 5, 9
  pmulhrsw         m3, m12
  pmulhrsw         m5, m12

  SUM_SUB           0, 7, 9
  SUM_SUB           2, 3, 9
  SUM_SUB           4, 5, 9
  SUM_SUB           6, 1, 9

  ; rename registers so the results sit in m0..m7 in output order
  SWAP              3, 6
  SWAP              1, 2
  SWAP              2, 4

  ; m12 is no longer needed as a multiplier; reuse it as the zero register
  pxor            m12, m12
  ADD_STORE_8P_2X   0, 1, 9, 10, 12
  lea         outputq, [outputq + r3]
  ADD_STORE_8P_2X   2, 3, 9, 10, 12
  lea         outputq, [outputq + r3]
  ADD_STORE_8P_2X   4, 5, 9, 10, 12
  lea         outputq, [outputq + r3]
  ADD_STORE_8P_2X   6, 7, 9, 10, 12

  RET
299 | |
300 %endif | |
OLD | NEW |