Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(134)

Side by Side Diff: source/libvpx/vp9/common/vp9_idctllm.c

Issue 11974002: libvpx: Pull from upstream (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 7 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLD NEW
1 /* 1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 11
12 /**************************************************************************** 12 /****************************************************************************
13 * Notes: 13 * Notes:
14 * 14 *
15 * This implementation makes use of 16 bit fixed point version of two multiply 15 * This implementation makes use of 16 bit fixed point version of two multiply
16 * constants: 16 * constants:
17 * 1. sqrt(2) * cos (pi/8) 17 * 1. sqrt(2) * cos (pi/8)
18 * 2. sqrt(2) * sin (pi/8) 18 * 2. sqrt(2) * sin (pi/8)
19 * Because the first constant is bigger than 1, to maintain the same 16 bit 19 * Because the first constant is bigger than 1, to maintain the same 16 bit
20 * fixed point precision as the second one, we use a trick of 20 * fixed point precision as the second one, we use a trick of
21 * x * a = x + x*(a-1) 21 * x * a = x + x*(a-1)
22 * so 22 * so
23 * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). 23 * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
24 **************************************************************************/ 24 **************************************************************************/
25 #include <assert.h> 25 #include <assert.h>
26 #include <math.h> 26 #include <math.h>
27 #include "vpx_ports/config.h" 27 #include "./vpx_config.h"
28 #include "vp9/common/vp9_systemdependent.h" 28 #include "vp9/common/vp9_systemdependent.h"
29
30 #include "vp9/common/vp9_blockd.h" 29 #include "vp9/common/vp9_blockd.h"
30 #include "vp9/common/vp9_common.h"
31 31
32 static const int cospi8sqrt2minus1 = 20091; 32 static const int cospi8sqrt2minus1 = 20091;
33 static const int sinpi8sqrt2 = 35468; 33 static const int sinpi8sqrt2 = 35468;
34 static const int rounding = 0; 34 static const int rounding = 0;
35 35
36 static const int16_t idct_i4[16] = { 36 static const int16_t idct_i4[16] = {
37 8192, 10703, 8192, 4433, 37 8192, 10703, 8192, 4433,
38 8192, 4433, -8192, -10703, 38 8192, 4433, -8192, -10703,
39 8192, -4433, -8192, 10703, 39 8192, -4433, -8192, 10703,
40 8192, -10703, 8192, -4433 40 8192, -10703, 8192, -4433
(...skipping 111 matching lines...) Expand 10 before | Expand all | Expand 10 after
152 5543, -4311, 2120, 542, -3084, 4940, -5698, 5189, 152 5543, -4311, 2120, 542, -3084, 4940, -5698, 5189,
153 -3526, 1080, 1607, -3936, 5390, -5646, 4646, -2614, 153 -3526, 1080, 1607, -3936, 5390, -5646, 4646, -2614,
154 5646, -5189, 4311, -3084, 1607, 0, -1607, 3084, 154 5646, -5189, 4311, -3084, 1607, 0, -1607, 3084,
155 -4311, 5189, -5646, 5646, -5189, 4311, -3084, 1607, 155 -4311, 5189, -5646, 5646, -5189, 4311, -3084, 1607,
156 5698, -5646, 5543, -5390, 5189, -4940, 4646, -4311, 156 5698, -5646, 5543, -5390, 5189, -4940, 4646, -4311,
157 3936, -3526, 3084, -2614, 2120, -1607, 1080, -542 157 3936, -3526, 3084, -2614, 2120, -1607, 1080, -542
158 }; 158 };
159 159
160 160
161 /* Converted the transforms to integer form. */ 161 /* Converted the transforms to integer form. */
162 #define VERTICAL_SHIFT 14 // 16 162 #define HORIZONTAL_SHIFT 14 // 16
163 #define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
164 #define VERTICAL_SHIFT 17 // 15
163 #define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1) 165 #define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
164 #define HORIZONTAL_SHIFT 17 // 15
165 #define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
166 void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch, 166 void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
167 TX_TYPE tx_type, int tx_dim, uint16_t eobs) { 167 TX_TYPE tx_type, int tx_dim, uint16_t eobs) {
168 int i, j, k; 168 int i, j, k;
169 int nz_dim; 169 int nz_dim;
170 int16_t imbuf[256]; 170 int16_t imbuf[256];
171 171
172 const int16_t *ip = input; 172 const int16_t *ip = input;
173 int16_t *op = output; 173 int16_t *op = output;
174 int16_t *im = &imbuf[0]; 174 int16_t *im = &imbuf[0];
175 175
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
211 vpx_memset(im, 0, 512); 211 vpx_memset(im, 0, 512);
212 nz_dim = 8; 212 nz_dim = 8;
213 if(eobs < 3) { 213 if(eobs < 3) {
214 nz_dim = 2; 214 nz_dim = 2;
215 } else if(eobs < 10) { 215 } else if(eobs < 10) {
216 nz_dim = 4; 216 nz_dim = 4;
217 } 217 }
218 } 218 }
219 } 219 }
220 220
221 /* vertical transformation */ 221 /* 2-D inverse transform X = M1*Z*Transposed_M2 is calculated in 2 steps
222 * from right to left:
223 * 1. horizontal transform: Y= Z*Transposed_M2
224 * 2. vertical transform: X = M1*Y
225 * In SIMD, doing this way could eliminate the transpose needed if it is
226 * calculated from left to right.
227 */
228 /* Horizontal transformation */
222 for (j = 0; j < tx_dim; j++) { 229 for (j = 0; j < tx_dim; j++) {
223 for (i = 0; i < nz_dim; i++) { 230 for (i = 0; i < nz_dim; i++) {
224 int temp = 0; 231 int temp = 0;
225 232
226 for (k = 0; k < nz_dim; k++) { 233 for (k = 0; k < nz_dim; k++) {
227 temp += ptv[k] * ip[(k * tx_dim)]; 234 temp += ip[k] * pth[k];
228 } 235 }
229 236
230 im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT); 237 /* Calculate im and store it in its transposed position. */
231 ip++; 238 im[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
239 ip += tx_dim;
232 } 240 }
233 im += tx_dim; // 16 241 im += tx_dim;
234 ptv += tx_dim; 242 pth += tx_dim;
235 ip = input; 243 ip = input;
236 } 244 }
237 245
238 /* horizontal transformation */ 246 /* Vertical transformation */
239 im = &imbuf[0]; 247 im = &imbuf[0];
240 248
241 for (j = 0; j < tx_dim; j++) { 249 for (i = 0; i < tx_dim; i++) {
242 const int16_t *pthc = pth; 250 for (j = 0; j < tx_dim; j++) {
243
244 for (i = 0; i < tx_dim; i++) {
245 int temp = 0; 251 int temp = 0;
246 252
247 for (k = 0; k < nz_dim; k++) { 253 for (k = 0; k < nz_dim; k++) {
248 temp += im[k] * pthc[k]; 254 temp += ptv[k] * im[k];
249 } 255 }
250 256
251 op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT); 257 op[j] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
252 pthc += tx_dim; 258 im += tx_dim;
253 } 259 }
254 260 im = &imbuf[0];
255 im += tx_dim; // 16 261 ptv += tx_dim;
256 op += shortpitch; 262 op += shortpitch;
257 } 263 }
258 } 264 }
259 265
260 void vp9_short_idct4x4llm_c(short *input, short *output, int pitch) { 266 void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) {
261 int i; 267 int i;
262 int a1, b1, c1, d1; 268 int a1, b1, c1, d1;
263 269
264 short *ip = input; 270 int16_t *ip = input;
265 short *op = output; 271 int16_t *op = output;
266 int temp1, temp2; 272 int temp1, temp2;
267 int shortpitch = pitch >> 1; 273 int shortpitch = pitch >> 1;
268 274
269 for (i = 0; i < 4; i++) { 275 for (i = 0; i < 4; i++) {
270 a1 = ip[0] + ip[8]; 276 a1 = ip[0] + ip[8];
271 b1 = ip[0] - ip[8]; 277 b1 = ip[0] - ip[8];
272 278
273 temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16; 279 temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
274 temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16); 280 temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
275 c1 = temp1 - temp2; 281 c1 = temp1 - temp2;
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
307 op[3] = (a1 - d1 + 16) >> 5; 313 op[3] = (a1 - d1 + 16) >> 5;
308 314
309 op[1] = (b1 + c1 + 16) >> 5; 315 op[1] = (b1 + c1 + 16) >> 5;
310 op[2] = (b1 - c1 + 16) >> 5; 316 op[2] = (b1 - c1 + 16) >> 5;
311 317
312 ip += shortpitch; 318 ip += shortpitch;
313 op += shortpitch; 319 op += shortpitch;
314 } 320 }
315 } 321 }
316 322
317 void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch) { 323 void vp9_short_idct4x4llm_1_c(int16_t *input, int16_t *output, int pitch) {
318 int i; 324 int i;
319 int a1; 325 int a1;
320 short *op = output; 326 int16_t *op = output;
321 int shortpitch = pitch >> 1; 327 int shortpitch = pitch >> 1;
322 a1 = ((input[0] + 16) >> 5); 328 a1 = ((input[0] + 16) >> 5);
323 for (i = 0; i < 4; i++) { 329 for (i = 0; i < 4; i++) {
324 op[0] = a1; 330 op[0] = a1;
325 op[1] = a1; 331 op[1] = a1;
326 op[2] = a1; 332 op[2] = a1;
327 op[3] = a1; 333 op[3] = a1;
328 op += shortpitch; 334 op += shortpitch;
329 } 335 }
330 } 336 }
331 337
332 void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, 338 void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr,
333 unsigned char *dst_ptr, int pitch, int stride) { 339 uint8_t *dst_ptr, int pitch, int stride) {
334 int a1 = ((input_dc + 16) >> 5); 340 int a1 = ((input_dc + 16) >> 5);
335 int r, c; 341 int r, c;
336 342
337 for (r = 0; r < 4; r++) { 343 for (r = 0; r < 4; r++) {
338 for (c = 0; c < 4; c++) { 344 for (c = 0; c < 4; c++) {
339 int a = a1 + pred_ptr[c]; 345 dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]);
340
341 if (a < 0)
342 a = 0;
343
344 if (a > 255)
345 a = 255;
346
347 dst_ptr[c] = (unsigned char) a;
348 } 346 }
349 347
350 dst_ptr += stride; 348 dst_ptr += stride;
351 pred_ptr += pitch; 349 pred_ptr += pitch;
352 } 350 }
353 } 351 }
354 352
355 void vp9_short_inv_walsh4x4_c(short *input, short *output) { 353 void vp9_short_inv_walsh4x4_c(int16_t *input, int16_t *output) {
356 int i; 354 int i;
357 int a1, b1, c1, d1; 355 int a1, b1, c1, d1;
358 short *ip = input; 356 int16_t *ip = input;
359 short *op = output; 357 int16_t *op = output;
360 358
361 for (i = 0; i < 4; i++) { 359 for (i = 0; i < 4; i++) {
362 a1 = ((ip[0] + ip[3])); 360 a1 = ((ip[0] + ip[3]));
363 b1 = ((ip[1] + ip[2])); 361 b1 = ((ip[1] + ip[2]));
364 c1 = ((ip[1] - ip[2])); 362 c1 = ((ip[1] - ip[2]));
365 d1 = ((ip[0] - ip[3])); 363 d1 = ((ip[0] - ip[3]));
366 364
367 op[0] = (a1 + b1 + 1) >> 1; 365 op[0] = (a1 + b1 + 1) >> 1;
368 op[1] = (c1 + d1) >> 1; 366 op[1] = (c1 + d1) >> 1;
369 op[2] = (a1 - b1) >> 1; 367 op[2] = (a1 - b1) >> 1;
(...skipping 12 matching lines...) Expand all
382 d1 = ip[0] - ip[12]; 380 d1 = ip[0] - ip[12];
383 op[0] = (a1 + b1 + 1) >> 1; 381 op[0] = (a1 + b1 + 1) >> 1;
384 op[4] = (c1 + d1) >> 1; 382 op[4] = (c1 + d1) >> 1;
385 op[8] = (a1 - b1) >> 1; 383 op[8] = (a1 - b1) >> 1;
386 op[12] = (d1 - c1) >> 1; 384 op[12] = (d1 - c1) >> 1;
387 ip++; 385 ip++;
388 op++; 386 op++;
389 } 387 }
390 } 388 }
391 389
392 void vp9_short_inv_walsh4x4_1_c(short *in, short *out) { 390 void vp9_short_inv_walsh4x4_1_c(int16_t *in, int16_t *out) {
393 int i; 391 int i;
394 short tmp[4]; 392 int16_t tmp[4];
395 short *ip = in; 393 int16_t *ip = in;
396 short *op = tmp; 394 int16_t *op = tmp;
397 395
398 op[0] = (ip[0] + 1) >> 1; 396 op[0] = (ip[0] + 1) >> 1;
399 op[1] = op[2] = op[3] = (ip[0] >> 1); 397 op[1] = op[2] = op[3] = (ip[0] >> 1);
400 398
401 ip = tmp; 399 ip = tmp;
402 op = out; 400 op = out;
403 for (i = 0; i < 4; i++) { 401 for (i = 0; i < 4; i++) {
404 op[0] = (ip[0] + 1) >> 1; 402 op[0] = (ip[0] + 1) >> 1;
405 op[4] = op[8] = op[12] = (ip[0] >> 1); 403 op[4] = op[8] = op[12] = (ip[0] >> 1);
406 ip++; 404 ip++;
407 op++; 405 op++;
408 } 406 }
409 } 407 }
410 408
411 #if CONFIG_LOSSLESS 409 #if CONFIG_LOSSLESS
412 void vp9_short_inv_walsh4x4_lossless_c(short *input, short *output) { 410 void vp9_short_inv_walsh4x4_lossless_c(int16_t *input, int16_t *output) {
413 int i; 411 int i;
414 int a1, b1, c1, d1; 412 int a1, b1, c1, d1;
415 short *ip = input; 413 int16_t *ip = input;
416 short *op = output; 414 int16_t *op = output;
417 415
418 for (i = 0; i < 4; i++) { 416 for (i = 0; i < 4; i++) {
419 a1 = ((ip[0] + ip[3])) >> Y2_WHT_UPSCALE_FACTOR; 417 a1 = ((ip[0] + ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
420 b1 = ((ip[1] + ip[2])) >> Y2_WHT_UPSCALE_FACTOR; 418 b1 = ((ip[1] + ip[2])) >> Y2_WHT_UPSCALE_FACTOR;
421 c1 = ((ip[1] - ip[2])) >> Y2_WHT_UPSCALE_FACTOR; 419 c1 = ((ip[1] - ip[2])) >> Y2_WHT_UPSCALE_FACTOR;
422 d1 = ((ip[0] - ip[3])) >> Y2_WHT_UPSCALE_FACTOR; 420 d1 = ((ip[0] - ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
423 421
424 op[0] = (a1 + b1 + 1) >> 1; 422 op[0] = (a1 + b1 + 1) >> 1;
425 op[1] = (c1 + d1) >> 1; 423 op[1] = (c1 + d1) >> 1;
426 op[2] = (a1 - b1) >> 1; 424 op[2] = (a1 - b1) >> 1;
(...skipping 15 matching lines...) Expand all
442 op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR; 440 op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
443 op[4] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR; 441 op[4] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
444 op[8] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR; 442 op[8] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
445 op[12] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR; 443 op[12] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
446 444
447 ip++; 445 ip++;
448 op++; 446 op++;
449 } 447 }
450 } 448 }
451 449
452 void vp9_short_inv_walsh4x4_1_lossless_c(short *in, short *out) { 450 void vp9_short_inv_walsh4x4_1_lossless_c(int16_t *in, int16_t *out) {
453 int i; 451 int i;
454 short tmp[4]; 452 int16_t tmp[4];
455 short *ip = in; 453 int16_t *ip = in;
456 short *op = tmp; 454 int16_t *op = tmp;
457 455
458 op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1; 456 op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1;
459 op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1); 457 op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1);
460 458
461 ip = tmp; 459 ip = tmp;
462 op = out; 460 op = out;
463 for (i = 0; i < 4; i++) { 461 for (i = 0; i < 4; i++) {
464 op[0] = ((ip[0] + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR; 462 op[0] = ((ip[0] + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;
465 op[4] = op[8] = op[12] = ((ip[0] >> 1)) << Y2_WHT_UPSCALE_FACTOR; 463 op[4] = op[8] = op[12] = ((ip[0] >> 1)) << Y2_WHT_UPSCALE_FACTOR;
466 ip++; 464 ip++;
467 op++; 465 op++;
468 } 466 }
469 } 467 }
470 468
471 void vp9_short_inv_walsh4x4_x8_c(short *input, short *output, int pitch) { 469 void vp9_short_inv_walsh4x4_x8_c(int16_t *input, int16_t *output, int pitch) {
472 int i; 470 int i;
473 int a1, b1, c1, d1; 471 int a1, b1, c1, d1;
474 short *ip = input; 472 int16_t *ip = input;
475 short *op = output; 473 int16_t *op = output;
476 int shortpitch = pitch >> 1; 474 int shortpitch = pitch >> 1;
477 475
478 for (i = 0; i < 4; i++) { 476 for (i = 0; i < 4; i++) {
479 a1 = ((ip[0] + ip[3])) >> WHT_UPSCALE_FACTOR; 477 a1 = ((ip[0] + ip[3])) >> WHT_UPSCALE_FACTOR;
480 b1 = ((ip[1] + ip[2])) >> WHT_UPSCALE_FACTOR; 478 b1 = ((ip[1] + ip[2])) >> WHT_UPSCALE_FACTOR;
481 c1 = ((ip[1] - ip[2])) >> WHT_UPSCALE_FACTOR; 479 c1 = ((ip[1] - ip[2])) >> WHT_UPSCALE_FACTOR;
482 d1 = ((ip[0] - ip[3])) >> WHT_UPSCALE_FACTOR; 480 d1 = ((ip[0] - ip[3])) >> WHT_UPSCALE_FACTOR;
483 481
484 op[0] = (a1 + b1 + 1) >> 1; 482 op[0] = (a1 + b1 + 1) >> 1;
485 op[1] = (c1 + d1) >> 1; 483 op[1] = (c1 + d1) >> 1;
(...skipping 16 matching lines...) Expand all
502 op[shortpitch * 0] = (a1 + b1 + 1) >> 1; 500 op[shortpitch * 0] = (a1 + b1 + 1) >> 1;
503 op[shortpitch * 1] = (c1 + d1) >> 1; 501 op[shortpitch * 1] = (c1 + d1) >> 1;
504 op[shortpitch * 2] = (a1 - b1) >> 1; 502 op[shortpitch * 2] = (a1 - b1) >> 1;
505 op[shortpitch * 3] = (d1 - c1) >> 1; 503 op[shortpitch * 3] = (d1 - c1) >> 1;
506 504
507 ip++; 505 ip++;
508 op++; 506 op++;
509 } 507 }
510 } 508 }
511 509
512 void vp9_short_inv_walsh4x4_1_x8_c(short *in, short *out, int pitch) { 510 void vp9_short_inv_walsh4x4_1_x8_c(int16_t *in, int16_t *out, int pitch) {
513 int i; 511 int i;
514 short tmp[4]; 512 int16_t tmp[4];
515 short *ip = in; 513 int16_t *ip = in;
516 short *op = tmp; 514 int16_t *op = tmp;
517 int shortpitch = pitch >> 1; 515 int shortpitch = pitch >> 1;
518 516
519 op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1; 517 op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
520 op[1] = op[2] = op[3] = ((ip[0] >> WHT_UPSCALE_FACTOR) >> 1); 518 op[1] = op[2] = op[3] = ((ip[0] >> WHT_UPSCALE_FACTOR) >> 1);
521 519
522 520
523 ip = tmp; 521 ip = tmp;
524 op = out; 522 op = out;
525 for (i = 0; i < 4; i++) { 523 for (i = 0; i < 4; i++) {
526 op[shortpitch * 0] = (ip[0] + 1) >> 1; 524 op[shortpitch * 0] = (ip[0] + 1) >> 1;
527 op[shortpitch * 1] = op[shortpitch * 2] = op[shortpitch * 3] = ip[0] >> 1; 525 op[shortpitch * 1] = op[shortpitch * 2] = op[shortpitch * 3] = ip[0] >> 1;
528 ip++; 526 ip++;
529 op++; 527 op++;
530 } 528 }
531 } 529 }
532 530
533 void vp9_dc_only_inv_walsh_add_c(short input_dc, unsigned char *pred_ptr, 531 void vp9_dc_only_inv_walsh_add_c(short input_dc, uint8_t *pred_ptr,
534 unsigned char *dst_ptr, 532 uint8_t *dst_ptr,
535 int pitch, int stride) { 533 int pitch, int stride) {
536 int r, c; 534 int r, c;
537 short tmp[16]; 535 short tmp[16];
538 vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1); 536 vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1);
539 537
540 for (r = 0; r < 4; r++) { 538 for (r = 0; r < 4; r++) {
541 for (c = 0; c < 4; c++) { 539 for (c = 0; c < 4; c++) {
542 int a = tmp[r * 4 + c] + pred_ptr[c]; 540 dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);
543 if (a < 0)
544 a = 0;
545
546 if (a > 255)
547 a = 255;
548
549 dst_ptr[c] = (unsigned char) a;
550 } 541 }
551 542
552 dst_ptr += stride; 543 dst_ptr += stride;
553 pred_ptr += pitch; 544 pred_ptr += pitch;
554 } 545 }
555 } 546 }
556 #endif 547 #endif
557 548
558 void vp9_dc_only_idct_add_8x8_c(short input_dc, 549 void vp9_dc_only_idct_add_8x8_c(short input_dc,
559 unsigned char *pred_ptr, 550 uint8_t *pred_ptr,
560 unsigned char *dst_ptr, 551 uint8_t *dst_ptr,
561 int pitch, int stride) { 552 int pitch, int stride) {
562 int a1 = ((input_dc + 16) >> 5); 553 int a1 = ((input_dc + 16) >> 5);
563 int r, c, b; 554 int r, c, b;
564 unsigned char *orig_pred = pred_ptr; 555 uint8_t *orig_pred = pred_ptr;
565 unsigned char *orig_dst = dst_ptr; 556 uint8_t *orig_dst = dst_ptr;
566 for (b = 0; b < 4; b++) { 557 for (b = 0; b < 4; b++) {
567 for (r = 0; r < 4; r++) { 558 for (r = 0; r < 4; r++) {
568 for (c = 0; c < 4; c++) { 559 for (c = 0; c < 4; c++) {
569 int a = a1 + pred_ptr[c]; 560 dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]);
570
571 if (a < 0)
572 a = 0;
573
574 if (a > 255)
575 a = 255;
576
577 dst_ptr[c] = (unsigned char) a;
578 } 561 }
579 562
580 dst_ptr += stride; 563 dst_ptr += stride;
581 pred_ptr += pitch; 564 pred_ptr += pitch;
582 } 565 }
583 dst_ptr = orig_dst + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * stride; 566 dst_ptr = orig_dst + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * stride;
584 pred_ptr = orig_pred + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * pitch; 567 pred_ptr = orig_pred + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * pitch;
585 } 568 }
586 } 569 }
587 570
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
655 * 638 *
656 * where: c[0] = 1/1024 c[1..7] = (1/1024)*sqrt(2) */ 639 * where: c[0] = 1/1024 c[1..7] = (1/1024)*sqrt(2) */
657 static void idctcol(int *blk) { 640 static void idctcol(int *blk) {
658 int x0, x1, x2, x3, x4, x5, x6, x7, x8; 641 int x0, x1, x2, x3, x4, x5, x6, x7, x8;
659 642
660 /* shortcut */ 643 /* shortcut */
661 if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) | 644 if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
662 (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) | 645 (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |
663 (x7 = blk[8 * 3]))) { 646 (x7 = blk[8 * 3]))) {
664 blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] 647 blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]
665 = blk[8 * 4] = blk[8 * 5] = blk[8 * 6 ] 648 = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]
666 = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6); 649 = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);
667 return; 650 return;
668 } 651 }
669 652
670 x0 = (blk[8 * 0] << 8) + 16384; 653 x0 = (blk[8 * 0] << 8) + 16384;
671 654
672 /* first stage */ 655 /* first stage */
673 x8 = W7 * (x4 + x5) + 4; 656 x8 = W7 * (x4 + x5) + 4;
674 x4 = (x8 + (W1 - W7) * x4) >> 3; 657 x4 = (x8 + (W1 - W7) * x4) >> 3;
675 x5 = (x8 - (W1 + W7) * x5) >> 3; 658 x5 = (x8 - (W1 + W7) * x5) >> 3;
676 x8 = W3 * (x6 + x7) + 4; 659 x8 = W3 * (x6 + x7) + 4;
(...skipping 24 matching lines...) Expand all
701 blk[8 * 1] = (x3 + x2) >> 14; 684 blk[8 * 1] = (x3 + x2) >> 14;
702 blk[8 * 2] = (x0 + x4) >> 14; 685 blk[8 * 2] = (x0 + x4) >> 14;
703 blk[8 * 3] = (x8 + x6) >> 14; 686 blk[8 * 3] = (x8 + x6) >> 14;
704 blk[8 * 4] = (x8 - x6) >> 14; 687 blk[8 * 4] = (x8 - x6) >> 14;
705 blk[8 * 5] = (x0 - x4) >> 14; 688 blk[8 * 5] = (x0 - x4) >> 14;
706 blk[8 * 6] = (x3 - x2) >> 14; 689 blk[8 * 6] = (x3 - x2) >> 14;
707 blk[8 * 7] = (x7 - x1) >> 14; 690 blk[8 * 7] = (x7 - x1) >> 14;
708 } 691 }
709 692
710 #define TX_DIM 8 693 #define TX_DIM 8
711 void vp9_short_idct8x8_c(short *coefs, short *block, int pitch) { 694 void vp9_short_idct8x8_c(int16_t *coefs, int16_t *block, int pitch) {
712 int X[TX_DIM * TX_DIM]; 695 int X[TX_DIM * TX_DIM];
713 int i, j; 696 int i, j;
714 int shortpitch = pitch >> 1; 697 int shortpitch = pitch >> 1;
715 698
716 for (i = 0; i < TX_DIM; i++) { 699 for (i = 0; i < TX_DIM; i++) {
717 for (j = 0; j < TX_DIM; j++) { 700 for (j = 0; j < TX_DIM; j++) {
718 X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1 701 X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1
719 + (coefs[i * TX_DIM + j] < 0)) >> 2; 702 + (coefs[i * TX_DIM + j] < 0)) >> 2;
720 } 703 }
721 } 704 }
(...skipping 98 matching lines...) Expand 10 before | Expand all | Expand 10 after
820 blk[8 * 0] = (x7 + x1) >> 14; 803 blk[8 * 0] = (x7 + x1) >> 14;
821 blk[8 * 1] = (x3 + x2) >> 14; 804 blk[8 * 1] = (x3 + x2) >> 14;
822 blk[8 * 2] = (x0 + x4) >> 14; 805 blk[8 * 2] = (x0 + x4) >> 14;
823 blk[8 * 3] = (x8 + x6) >> 14; 806 blk[8 * 3] = (x8 + x6) >> 14;
824 blk[8 * 4] = (x8 - x6) >> 14; 807 blk[8 * 4] = (x8 - x6) >> 14;
825 blk[8 * 5] = (x0 - x4) >> 14; 808 blk[8 * 5] = (x0 - x4) >> 14;
826 blk[8 * 6] = (x3 - x2) >> 14; 809 blk[8 * 6] = (x3 - x2) >> 14;
827 blk[8 * 7] = (x7 - x1) >> 14; 810 blk[8 * 7] = (x7 - x1) >> 14;
828 } 811 }
829 812
830 void vp9_short_idct10_8x8_c(short *coefs, short *block, int pitch) { 813 void vp9_short_idct10_8x8_c(int16_t *coefs, int16_t *block, int pitch) {
831 int X[TX_DIM * TX_DIM]; 814 int X[TX_DIM * TX_DIM];
832 int i, j; 815 int i, j;
833 int shortpitch = pitch >> 1; 816 int shortpitch = pitch >> 1;
834 817
835 for (i = 0; i < TX_DIM; i++) { 818 for (i = 0; i < TX_DIM; i++) {
836 for (j = 0; j < TX_DIM; j++) { 819 for (j = 0; j < TX_DIM; j++) {
837 X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1 820 X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1
838 + (coefs[i * TX_DIM + j] < 0)) >> 2; 821 + (coefs[i * TX_DIM + j] < 0)) >> 2;
839 } 822 }
840 } 823 }
841 824
842 /* Do first 4 row idct only since non-zero dct coefficients are all in 825 /* Do first 4 row idct only since non-zero dct coefficients are all in
843 * upper-left 4x4 area. */ 826 * upper-left 4x4 area. */
844 for (i = 0; i < 4; i++) 827 for (i = 0; i < 4; i++)
845 idctrow10(X + 8 * i); 828 idctrow10(X + 8 * i);
846 829
847 for (i = 0; i < 8; i++) 830 for (i = 0; i < 8; i++)
848 idctcol10(X + i); 831 idctcol10(X + i);
849 832
850 for (i = 0; i < TX_DIM; i++) { 833 for (i = 0; i < TX_DIM; i++) {
851 for (j = 0; j < TX_DIM; j++) { 834 for (j = 0; j < TX_DIM; j++) {
852 block[i * shortpitch + j] = X[i * TX_DIM + j] >> 1; 835 block[i * shortpitch + j] = X[i * TX_DIM + j] >> 1;
853 } 836 }
854 } 837 }
855 } 838 }
856 839
857 void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) { 840 void vp9_short_ihaar2x2_c(int16_t *input, int16_t *output, int pitch) {
858 int i; 841 int i;
859 short *ip = input; // 0,1, 4, 8 842 int16_t *ip = input; // 0, 1, 4, 8
860 short *op = output; 843 int16_t *op = output;
861 for (i = 0; i < 16; i++) { 844 for (i = 0; i < 16; i++) {
862 op[i] = 0; 845 op[i] = 0;
863 } 846 }
864 847
865 op[0] = (ip[0] + ip[1] + ip[4] + ip[8] + 1) >> 1; 848 op[0] = (ip[0] + ip[1] + ip[4] + ip[8] + 1) >> 1;
866 op[1] = (ip[0] - ip[1] + ip[4] - ip[8]) >> 1; 849 op[1] = (ip[0] - ip[1] + ip[4] - ip[8]) >> 1;
867 op[4] = (ip[0] + ip[1] - ip[4] - ip[8]) >> 1; 850 op[4] = (ip[0] + ip[1] - ip[4] - ip[8]) >> 1;
868 op[8] = (ip[0] - ip[1] - ip[4] + ip[8]) >> 1; 851 op[8] = (ip[0] - ip[1] - ip[4] + ip[8]) >> 1;
869 } 852 }
870 853
871 854
872 #if 0 855 #if 0
873 // Keep a really bad float version as reference for now. 856 // Keep a really bad float version as reference for now.
874 void vp9_short_idct16x16_c(short *input, short *output, int pitch) { 857 void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
875 858
876 vp9_clear_system_state(); // Make it simd safe : __asm emms; 859 vp9_clear_system_state(); // Make it simd safe : __asm emms;
877 { 860 {
878 double x; 861 double x;
879 const int short_pitch = pitch >> 1; 862 const int short_pitch = pitch >> 1;
880 int i, j, k, l; 863 int i, j, k, l;
881 for (l = 0; l < 16; ++l) { 864 for (l = 0; l < 16; ++l) {
882 for (k = 0; k < 16; ++k) { 865 for (k = 0; k < 16; ++k) {
883 double s = 0; 866 double s = 0;
884 for (i = 0; i < 16; ++i) { 867 for (i = 0; i < 16; ++i) {
885 for (j = 0; j < 16; ++j) { 868 for (j = 0; j < 16; ++j) {
886 x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32; 869 x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32;
887 if (i != 0) 870 if (i != 0)
888 x *= sqrt(2.0); 871 x *= sqrt(2.0);
889 if (j != 0) 872 if (j != 0)
890 x *= sqrt(2.0); 873 x *= sqrt(2.0);
891 s += x; 874 s += x;
892 } 875 }
893 } 876 }
894 output[k*short_pitch+l] = (short)round(s); 877 output[k*short_pitch+l] = (short)round(s);
895 } 878 }
896 } 879 }
897 } 880 }
898 vp9_clear_system_state(); // Make it simd safe : __asm emms; 881 vp9_clear_system_state(); // Make it simd safe : __asm emms;
899 } 882 }
900 #endif 883 #endif
901 884
902 #define TEST_INT_16x16_IDCT 1 885 #define TEST_INT_16x16_IDCT 1
903 #if !TEST_INT_16x16_IDCT 886 #if !TEST_INT_16x16_IDCT
904 static const double C1 = 0.995184726672197;
905 static const double C2 = 0.98078528040323;
906 static const double C3 = 0.956940335732209;
907 static const double C4 = 0.923879532511287;
908 static const double C5 = 0.881921264348355;
909 static const double C6 = 0.831469612302545;
910 static const double C7 = 0.773010453362737;
911 static const double C8 = 0.707106781186548;
912 static const double C9 = 0.634393284163646;
913 static const double C10 = 0.555570233019602;
914 static const double C11 = 0.471396736825998;
915 static const double C12 = 0.38268343236509;
916 static const double C13 = 0.290284677254462;
917 static const double C14 = 0.195090322016128;
918 static const double C15 = 0.098017140329561;
919
920 887
921 static void butterfly_16x16_idct_1d(double input[16], double output[16]) { 888 static void butterfly_16x16_idct_1d(double input[16], double output[16]) {
922 889
890 static const double C1 = 0.995184726672197;
891 static const double C2 = 0.98078528040323;
892 static const double C3 = 0.956940335732209;
893 static const double C4 = 0.923879532511287;
894 static const double C5 = 0.881921264348355;
895 static const double C6 = 0.831469612302545;
896 static const double C7 = 0.773010453362737;
897 static const double C8 = 0.707106781186548;
898 static const double C9 = 0.634393284163646;
899 static const double C10 = 0.555570233019602;
900 static const double C11 = 0.471396736825998;
901 static const double C12 = 0.38268343236509;
902 static const double C13 = 0.290284677254462;
903 static const double C14 = 0.195090322016128;
904 static const double C15 = 0.098017140329561;
905
923 vp9_clear_system_state(); // Make it simd safe : __asm emms; 906 vp9_clear_system_state(); // Make it simd safe : __asm emms;
924 { 907 {
925 double step[16]; 908 double step[16];
926 double intermediate[16]; 909 double intermediate[16];
927 double temp1, temp2; 910 double temp1, temp2;
928 911
929 912
930 // step 1 and 2 913 // step 1 and 2
931 step[ 0] = input[0] + input[8]; 914 step[ 0] = input[0] + input[8];
932 step[ 1] = input[0] - input[8]; 915 step[ 1] = input[0] - input[8];
(...skipping 191 matching lines...) Expand 10 before | Expand all | Expand 10 after
1124 output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0); 1107 output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0);
1125 if (n == 0) 1108 if (n == 0)
1126 output[k] = output[k]/kSqrt2; 1109 output[k] = output[k]/kSqrt2;
1127 } 1110 }
1128 } 1111 }
1129 } 1112 }
1130 vp9_clear_system_state(); // Make it simd safe : __asm emms; 1113 vp9_clear_system_state(); // Make it simd safe : __asm emms;
1131 } 1114 }
1132 #endif 1115 #endif
1133 1116
1134 void vp9_short_idct16x16_c(short *input, short *output, int pitch) { 1117 void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
1135 1118
1136 vp9_clear_system_state(); // Make it simd safe : __asm emms; 1119 vp9_clear_system_state(); // Make it simd safe : __asm emms;
1137 { 1120 {
1138 double out[16*16], out2[16*16]; 1121 double out[16*16], out2[16*16];
1139 const int short_pitch = pitch >> 1; 1122 const int short_pitch = pitch >> 1;
1140 int i, j; 1123 int i, j;
1141 // First transform rows 1124 // First transform rows
1142 for (i = 0; i < 16; ++i) { 1125 for (i = 0; i < 16; ++i) {
1143 double temp_in[16], temp_out[16]; 1126 double temp_in[16], temp_out[16];
1144 for (j = 0; j < 16; ++j) 1127 for (j = 0; j < 16; ++j)
(...skipping 11 matching lines...) Expand all
1156 for (j = 0; j < 16; ++j) 1139 for (j = 0; j < 16; ++j)
1157 out2[j*16 + i] = temp_out[j]; 1140 out2[j*16 + i] = temp_out[j];
1158 } 1141 }
1159 for (i = 0; i < 16*16; ++i) 1142 for (i = 0; i < 16*16; ++i)
1160 output[i] = round(out2[i]/128); 1143 output[i] = round(out2[i]/128);
1161 } 1144 }
1162 vp9_clear_system_state(); // Make it simd safe : __asm emms; 1145 vp9_clear_system_state(); // Make it simd safe : __asm emms;
1163 } 1146 }
1164 1147
1165 #else 1148 #else
1149
/* Fixed-point scaling for the integer 16x16 inverse DCT.
 *   INITIAL_SHIFT / INITIAL_ROUNDING: 2-bit rounded down-shift applied to the
 *                                     first-stage sums.
 *   RIGHT_SHIFT / RIGHT_ROUNDING:     14-bit rounded down-shift applied after a
 *                                     multiplication by one of the Ck constants.
 */
#define INITIAL_SHIFT 2
#define INITIAL_ROUNDING (1 << (INITIAL_SHIFT - 1))
#define RIGHT_SHIFT 14
#define RIGHT_ROUNDING (1 << (RIGHT_SHIFT - 1))

/* Cosine table in Q14: Ck == round(16384 * cos(k * pi / 32)). */
static const int16_t C1 = 16305;   /* cos( 1 * pi / 32) */
static const int16_t C2 = 16069;   /* cos( 2 * pi / 32) */
static const int16_t C3 = 15679;   /* cos( 3 * pi / 32) */
static const int16_t C4 = 15137;   /* cos( 4 * pi / 32) */
static const int16_t C5 = 14449;   /* cos( 5 * pi / 32) */
static const int16_t C6 = 13623;   /* cos( 6 * pi / 32) */
static const int16_t C7 = 12665;   /* cos( 7 * pi / 32) */
static const int16_t C8 = 11585;   /* cos( 8 * pi / 32) */
static const int16_t C9 = 10394;   /* cos( 9 * pi / 32) */
static const int16_t C10 = 9102;   /* cos(10 * pi / 32) */
static const int16_t C11 = 7723;   /* cos(11 * pi / 32) */
static const int16_t C12 = 6270;   /* cos(12 * pi / 32) */
static const int16_t C13 = 4756;   /* cos(13 * pi / 32) */
static const int16_t C14 = 3196;   /* cos(14 * pi / 32) */
static const int16_t C15 = 1606;   /* cos(15 * pi / 32) */
1186
1187 static void butterfly_16x16_idct_1d(int16_t input[16], int16_t output[16], 1171 static void butterfly_16x16_idct_1d(int16_t input[16], int16_t output[16],
1188 int last_shift_bits) { 1172 int last_shift_bits) {
1189 int16_t step[16]; 1173 int16_t step[16];
1190 int intermediate[16]; 1174 int intermediate[16];
1191 int temp1, temp2; 1175 int temp1, temp2;
1192 1176
1193 int step1_shift = RIGHT_SHIFT + INITIAL_SHIFT; 1177 int step1_shift = RIGHT_SHIFT + INITIAL_SHIFT;
1194 int step1_rounding = 1 << (step1_shift - 1); 1178 int step1_rounding = 1 << (step1_shift - 1);
1195 int last_rounding = 0; 1179 int last_rounding = 0;
1196 1180
1197 if (last_shift_bits > 0) 1181 if (last_shift_bits > 0)
1198 last_rounding = 1 << (last_shift_bits - 1); 1182 last_rounding = 1 << (last_shift_bits - 1);
1199 1183
1200 // step 1 and 2 1184 // step 1 and 2
1201 step[ 0] = (input[0] + input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT; 1185 step[ 0] = (input[0] + input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1202 step[ 1] = (input[0] - input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT; 1186 step[ 1] = (input[0] - input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1203 1187
1204 temp1 = input[4] * C12; 1188 temp1 = input[4] * C12;
1205 temp2 = input[12] * C4; 1189 temp2 = input[12] * C4;
1206 temp1 = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1190 temp1 = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1207 temp1 *= C8; 1191 temp1 *= C8;
1208 step[ 2] = (2 * (temp1) + step1_rounding) >> step1_shift; 1192 step[ 2] = (2 * (temp1) + step1_rounding) >> step1_shift;
1209 1193
1210 temp1 = input[4] * C4; 1194 temp1 = input[4] * C4;
1211 temp2 = input[12] * C12; 1195 temp2 = input[12] * C12;
1212 temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1196 temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1213 temp1 *= C8; 1197 temp1 *= C8;
1214 step[ 3] = (2 * (temp1) + step1_rounding) >> step1_shift; 1198 step[ 3] = (2 * (temp1) + step1_rounding) >> step1_shift;
1215 1199
1216 temp1 = input[2] * C8; 1200 temp1 = input[2] * C8;
1217 temp1 = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1201 temp1 = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1218 temp2 = input[6] + input[10]; 1202 temp2 = input[6] + input[10];
1219 step[ 4] = (temp1 + temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT; 1203 step[ 4] = (temp1 + temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1220 step[ 5] = (temp1 - temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT; 1204 step[ 5] = (temp1 - temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1221 1205
1222 temp1 = input[14] * C8; 1206 temp1 = input[14] * C8;
1223 temp1 = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1207 temp1 = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1224 temp2 = input[6] - input[10]; 1208 temp2 = input[6] - input[10];
1225 step[ 6] = (temp2 - temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT; 1209 step[ 6] = (temp2 - temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1226 step[ 7] = (temp2 + temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT; 1210 step[ 7] = (temp2 + temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
1227 1211
1228 // for odd input 1212 // for odd input
1229 temp1 = input[3] * C12; 1213 temp1 = input[3] * C12;
1230 temp2 = input[13] * C4; 1214 temp2 = input[13] * C4;
1231 temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1215 temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1232 temp1 *= C8; 1216 temp1 *= C8;
1233 intermediate[ 8] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1217 intermediate[ 8] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1234 1218
1235 temp1 = input[3] * C4; 1219 temp1 = input[3] * C4;
1236 temp2 = input[13] * C12; 1220 temp2 = input[13] * C12;
1237 temp2 = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1221 temp2 = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1238 temp2 *= C8; 1222 temp2 *= C8;
1239 intermediate[ 9] = (2 * (temp2) + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1223 intermediate[ 9] = (2 * (temp2) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1240 1224
1241 intermediate[10] = (2 * (input[9] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1225 intermediate[10] = (2 * (input[9] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1242 intermediate[11] = input[15] - input[1]; 1226 intermediate[11] = input[15] - input[1];
1243 intermediate[12] = input[15] + input[1]; 1227 intermediate[12] = input[15] + input[1];
1244 intermediate[13] = (2 * (input[7] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1228 intermediate[13] = (2 * (input[7] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1245 1229
1246 temp1 = input[11] * C12; 1230 temp1 = input[11] * C12;
1247 temp2 = input[5] * C4; 1231 temp2 = input[5] * C4;
1248 temp2 = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1232 temp2 = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1249 temp2 *= C8; 1233 temp2 *= C8;
1250 intermediate[14] = (2 * (temp2) + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1234 intermediate[14] = (2 * (temp2) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1251 1235
1252 temp1 = input[11] * C4; 1236 temp1 = input[11] * C4;
1253 temp2 = input[5] * C12; 1237 temp2 = input[5] * C12;
1254 temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1238 temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1255 temp1 *= C8; 1239 temp1 *= C8;
1256 intermediate[15] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1240 intermediate[15] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1257 1241
1258 step[ 8] = (intermediate[ 8] + intermediate[14] + INITIAL_ROUNDING) 1242 step[ 8] = (intermediate[ 8] + intermediate[14] + INITIAL_ROUNDING)
1259 >> INITIAL_SHIFT; 1243 >> INITIAL_SHIFT;
1260 step[ 9] = (intermediate[ 9] + intermediate[15] + INITIAL_ROUNDING) 1244 step[ 9] = (intermediate[ 9] + intermediate[15] + INITIAL_ROUNDING)
1261 >> INITIAL_SHIFT; 1245 >> INITIAL_SHIFT;
1262 step[10] = (intermediate[10] + intermediate[11] + INITIAL_ROUNDING) 1246 step[10] = (intermediate[10] + intermediate[11] + INITIAL_ROUNDING)
1263 >> INITIAL_SHIFT; 1247 >> INITIAL_SHIFT;
1264 step[11] = (intermediate[10] - intermediate[11] + INITIAL_ROUNDING) 1248 step[11] = (intermediate[10] - intermediate[11] + INITIAL_ROUNDING)
1265 >> INITIAL_SHIFT; 1249 >> INITIAL_SHIFT;
1266 step[12] = (intermediate[12] + intermediate[13] + INITIAL_ROUNDING) 1250 step[12] = (intermediate[12] + intermediate[13] + INITIAL_ROUNDING)
1267 >> INITIAL_SHIFT; 1251 >> INITIAL_SHIFT;
1268 step[13] = (intermediate[12] - intermediate[13] + INITIAL_ROUNDING) 1252 step[13] = (intermediate[12] - intermediate[13] + INITIAL_ROUNDING)
1269 >> INITIAL_SHIFT; 1253 >> INITIAL_SHIFT;
1270 step[14] = (intermediate[ 8] - intermediate[14] + INITIAL_ROUNDING) 1254 step[14] = (intermediate[ 8] - intermediate[14] + INITIAL_ROUNDING)
1271 >> INITIAL_SHIFT; 1255 >> INITIAL_SHIFT;
1272 step[15] = (intermediate[ 9] - intermediate[15] + INITIAL_ROUNDING) 1256 step[15] = (intermediate[ 9] - intermediate[15] + INITIAL_ROUNDING)
1273 >> INITIAL_SHIFT; 1257 >> INITIAL_SHIFT;
1274 1258
1275 // step 3 1259 // step 3
1276 output[0] = step[ 0] + step[ 3]; 1260 output[0] = step[ 0] + step[ 3];
1277 output[1] = step[ 1] + step[ 2]; 1261 output[1] = step[ 1] + step[ 2];
1278 output[2] = step[ 1] - step[ 2]; 1262 output[2] = step[ 1] - step[ 2];
1279 output[3] = step[ 0] - step[ 3]; 1263 output[3] = step[ 0] - step[ 3];
1280 1264
1281 temp1 = step[ 4] * C14; 1265 temp1 = step[ 4] * C14;
1282 temp2 = step[ 7] * C2; 1266 temp2 = step[ 7] * C2;
1283 output[4] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1267 output[4] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1284 1268
1285 temp1 = step[ 4] * C2; 1269 temp1 = step[ 4] * C2;
1286 temp2 = step[ 7] * C14; 1270 temp2 = step[ 7] * C14;
1287 output[7] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1271 output[7] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1288 1272
1289 temp1 = step[ 5] * C10; 1273 temp1 = step[ 5] * C10;
1290 temp2 = step[ 6] * C6; 1274 temp2 = step[ 6] * C6;
1291 output[5] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1275 output[5] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1292 1276
1293 temp1 = step[ 5] * C6; 1277 temp1 = step[ 5] * C6;
1294 temp2 = step[ 6] * C10; 1278 temp2 = step[ 6] * C10;
1295 output[6] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1279 output[6] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1296 1280
1297 output[8] = step[ 8] + step[11]; 1281 output[8] = step[ 8] + step[11];
1298 output[9] = step[ 9] + step[10]; 1282 output[9] = step[ 9] + step[10];
1299 output[10] = step[ 9] - step[10]; 1283 output[10] = step[ 9] - step[10];
1300 output[11] = step[ 8] - step[11]; 1284 output[11] = step[ 8] - step[11];
1301 output[12] = step[12] + step[15]; 1285 output[12] = step[12] + step[15];
1302 output[13] = step[13] + step[14]; 1286 output[13] = step[13] + step[14];
1303 output[14] = step[13] - step[14]; 1287 output[14] = step[13] - step[14];
1304 output[15] = step[12] - step[15]; 1288 output[15] = step[12] - step[15];
1305 1289
1306 // output 4 1290 // output 4
1307 step[ 0] = output[0] + output[7]; 1291 step[ 0] = output[0] + output[7];
1308 step[ 1] = output[1] + output[6]; 1292 step[ 1] = output[1] + output[6];
1309 step[ 2] = output[2] + output[5]; 1293 step[ 2] = output[2] + output[5];
1310 step[ 3] = output[3] + output[4]; 1294 step[ 3] = output[3] + output[4];
1311 step[ 4] = output[3] - output[4]; 1295 step[ 4] = output[3] - output[4];
1312 step[ 5] = output[2] - output[5]; 1296 step[ 5] = output[2] - output[5];
1313 step[ 6] = output[1] - output[6]; 1297 step[ 6] = output[1] - output[6];
1314 step[ 7] = output[0] - output[7]; 1298 step[ 7] = output[0] - output[7];
1315 1299
1316 temp1 = output[8] * C7; 1300 temp1 = output[8] * C7;
1317 temp2 = output[15] * C9; 1301 temp2 = output[15] * C9;
1318 step[ 8] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1302 step[ 8] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1319 1303
1320 temp1 = output[9] * C11; 1304 temp1 = output[9] * C11;
1321 temp2 = output[14] * C5; 1305 temp2 = output[14] * C5;
1322 step[ 9] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1306 step[ 9] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1323 1307
1324 temp1 = output[10] * C3; 1308 temp1 = output[10] * C3;
1325 temp2 = output[13] * C13; 1309 temp2 = output[13] * C13;
1326 step[10] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1310 step[10] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1327 1311
1328 temp1 = output[11] * C15; 1312 temp1 = output[11] * C15;
1329 temp2 = output[12] * C1; 1313 temp2 = output[12] * C1;
1330 step[11] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1314 step[11] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1331 1315
1332 temp1 = output[11] * C1; 1316 temp1 = output[11] * C1;
1333 temp2 = output[12] * C15; 1317 temp2 = output[12] * C15;
1334 step[12] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1318 step[12] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1335 1319
1336 temp1 = output[10] * C13; 1320 temp1 = output[10] * C13;
1337 temp2 = output[13] * C3; 1321 temp2 = output[13] * C3;
1338 step[13] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1322 step[13] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1339 1323
1340 temp1 = output[9] * C5; 1324 temp1 = output[9] * C5;
1341 temp2 = output[14] * C11; 1325 temp2 = output[14] * C11;
1342 step[14] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1326 step[14] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1343 1327
1344 temp1 = output[8] * C9; 1328 temp1 = output[8] * C9;
1345 temp2 = output[15] * C7; 1329 temp2 = output[15] * C7;
1346 step[15] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; 1330 step[15] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
1347 1331
1348 // step 5 1332 // step 5
1349 output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits; 1333 output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits;
1350 output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits; 1334 output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits;
1351 output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits; 1335 output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits;
1352 output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits; 1336 output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits;
1353 output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits; 1337 output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits;
1354 output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits; 1338 output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits;
1355 output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits; 1339 output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits;
1356 output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits; 1340 output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits;
1357 1341
1358 output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits; 1342 output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits;
1359 output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits; 1343 output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits;
1360 output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits; 1344 output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits;
1361 output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits; 1345 output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits;
1362 output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits; 1346 output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits;
1363 output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits; 1347 output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits;
1364 output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits; 1348 output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits;
1365 output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits; 1349 output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits;
1366 } 1350 }
1367 1351
/* Integer 16x16 inverse DCT: a 1-D pass over every row followed by a 1-D pass
 * over every column.
 *
 * input:  16 rows of 16 coefficients; consecutive rows are (pitch >> 1)
 *         int16_t elements apart.
 * output: 16x16 result written densely (row stride 16); pitch is not applied
 *         to it.
 */
void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
  const int short_pitch = pitch >> 1;
  int16_t row_pass[16 * 16];
  int16_t column_in[16], column_out[16];
  int row, col, k;

  /* Row pass: no final shift. */
  for (row = 0; row < 16; ++row)
    butterfly_16x16_idct_1d(input + row * short_pitch, row_pass + row * 16, 0);

  /* Column pass: gather a column, transform with a 3-bit rounded shift,
   * scatter the result into the output block. */
  for (col = 0; col < 16; ++col) {
    for (k = 0; k < 16; ++k)
      column_in[k] = row_pass[k * 16 + col];
    butterfly_16x16_idct_1d(column_in, column_out, 3);
    for (k = 0; k < 16; ++k)
      output[k * 16 + col] = column_out[k];
  }
}
1391 1375
/* The following function is called when we know that the maximum number of
 * non-zero DCT coefficients is less than or equal to 10.
 */
1395 static void butterfly_16x16_idct10_1d(int16_t input[16], int16_t output[16], 1379 static void butterfly_16x16_idct10_1d(int16_t input[16], int16_t output[16],
1396 int last_shift_bits) { 1380 int last_shift_bits) {
1397 int16_t step[16] = {0}; 1381 int16_t step[16] = {0};
(...skipping 143 matching lines...) Expand 10 before | Expand all | Expand 10 after
1541 butterfly_16x16_idct10_1d(temp_in, temp_out, 3); 1525 butterfly_16x16_idct10_1d(temp_in, temp_out, 3);
1542 for (j = 0; j < 16; ++j) 1526 for (j = 0; j < 16; ++j)
1543 output[j*16 + i] = temp_out[j]; 1527 output[j*16 + i] = temp_out[j];
1544 } 1528 }
1545 } 1529 }
1546 #undef INITIAL_SHIFT 1530 #undef INITIAL_SHIFT
1547 #undef INITIAL_ROUNDING 1531 #undef INITIAL_ROUNDING
1548 #undef RIGHT_SHIFT 1532 #undef RIGHT_SHIFT
1549 #undef RIGHT_ROUNDING 1533 #undef RIGHT_ROUNDING
1550 #endif 1534 #endif
1535
1536 #if !CONFIG_DWTDCTHYBRID
/* Scaling hooks used by the double-precision transforms below. In this build
 * DownshiftMultiply(x) is the identity and DownshiftMultiplyBy2(x) doubles x.
 * The argument and the whole expansion are parenthesized so that the macros
 * evaluate correctly when the argument is a sum or when the macro appears
 * inside a larger expression, e.g. DownshiftMultiplyBy2(a + b) or
 * y / DownshiftMultiplyBy2(x).
 */
#define DownshiftMultiplyBy2(x) ((x) * 2)
#define DownshiftMultiply(x) (x)
1539
/* Function wrappers around the scaling macros, so they can be nested inside
 * larger expressions regardless of how the macros are parenthesized. */
static double idct16_scale(double value) {
  return DownshiftMultiply(value);
}

static double idct16_scale_by2(double value) {
  return DownshiftMultiplyBy2(value);
}

/* 16-point inverse DCT butterfly in double precision.
 *
 * Reads 16 values from input[stride * k] and writes 16 values to
 * output[stride * k], k = 0..15. All inputs are read before any output is
 * written.
 */
static void idct16(double *input, double *output, int stride) {
  /* Ck == cos(k * pi / 32) */
  static const double C1 = 0.995184726672197;
  static const double C2 = 0.98078528040323;
  static const double C3 = 0.956940335732209;
  static const double C4 = 0.923879532511287;
  static const double C5 = 0.881921264348355;
  static const double C6 = 0.831469612302545;
  static const double C7 = 0.773010453362737;
  static const double C8 = 0.707106781186548;
  static const double C9 = 0.634393284163646;
  static const double C10 = 0.555570233019602;
  static const double C11 = 0.471396736825998;
  static const double C12 = 0.38268343236509;
  static const double C13 = 0.290284677254462;
  static const double C14 = 0.195090322016128;
  static const double C15 = 0.098017140329561;

  double in[16];    /* strided input gathered into a dense array */
  double step[16];
  double odd[16];   /* only entries 8..15 are used */
  double mid[16];   /* step-3 results */
  double a, b;
  int k;

  for (k = 0; k < 16; ++k)
    in[k] = input[stride * k];

  /* Steps 1 and 2: even-indexed inputs. */
  step[0] = in[0] + in[8];
  step[1] = in[0] - in[8];
  step[2] = idct16_scale_by2(idct16_scale(in[4] * C12 - in[12] * C4) * C8);
  step[3] = idct16_scale_by2(idct16_scale(in[4] * C4 + in[12] * C12) * C8);

  a = idct16_scale_by2(in[2] * C8);
  b = in[6] + in[10];
  step[4] = a + b;
  step[5] = a - b;

  a = idct16_scale_by2(in[14] * C8);
  b = in[6] - in[10];
  step[6] = b - a;
  step[7] = b + a;

  /* Steps 1 and 2: odd-indexed inputs. */
  odd[8] = idct16_scale_by2(idct16_scale(in[3] * C12 + in[13] * C4) * C8);
  odd[9] = idct16_scale_by2(idct16_scale(in[13] * C12 - in[3] * C4) * C8);
  odd[10] = idct16_scale_by2(in[9] * C8);
  odd[11] = in[15] - in[1];
  odd[12] = in[15] + in[1];
  odd[13] = idct16_scale_by2(in[7] * C8);
  odd[14] = idct16_scale_by2(idct16_scale(in[5] * C4 - in[11] * C12) * C8);
  odd[15] = idct16_scale_by2(idct16_scale(in[11] * C4 + in[5] * C12) * C8);

  step[8] = odd[8] + odd[14];
  step[9] = odd[9] + odd[15];
  step[10] = odd[10] + odd[11];
  step[11] = odd[10] - odd[11];
  step[12] = odd[12] + odd[13];
  step[13] = odd[12] - odd[13];
  step[14] = odd[8] - odd[14];
  step[15] = odd[9] - odd[15];

  /* Step 3. */
  mid[0] = step[0] + step[3];
  mid[1] = step[1] + step[2];
  mid[2] = step[1] - step[2];
  mid[3] = step[0] - step[3];

  mid[4] = idct16_scale(step[4] * C14 - step[7] * C2);
  mid[5] = idct16_scale(step[5] * C10 - step[6] * C6);
  mid[6] = idct16_scale(step[5] * C6 + step[6] * C10);
  mid[7] = idct16_scale(step[4] * C2 + step[7] * C14);

  mid[8] = step[8] + step[11];
  mid[9] = step[9] + step[10];
  mid[10] = step[9] - step[10];
  mid[11] = step[8] - step[11];
  mid[12] = step[12] + step[15];
  mid[13] = step[13] + step[14];
  mid[14] = step[13] - step[14];
  mid[15] = step[12] - step[15];

  /* Step 4: mirrored sums/differences of the lower half ... */
  for (k = 0; k < 4; ++k) {
    step[k] = mid[k] + mid[7 - k];
    step[7 - k] = mid[k] - mid[7 - k];
  }

  /* ... and rotations of the upper half. */
  step[8] = idct16_scale(mid[8] * C7 - mid[15] * C9);
  step[9] = idct16_scale(mid[9] * C11 + mid[14] * C5);
  step[10] = idct16_scale(mid[10] * C3 - mid[13] * C13);
  step[11] = idct16_scale(mid[11] * C15 + mid[12] * C1);
  step[12] = idct16_scale(mid[12] * C15 - mid[11] * C1);
  step[13] = idct16_scale(mid[10] * C13 + mid[13] * C3);
  step[14] = idct16_scale(mid[14] * C11 - mid[9] * C5);
  step[15] = idct16_scale(mid[8] * C9 + mid[15] * C7);

  /* Step 5: final butterfly, written back with the caller's stride. */
  for (k = 0; k < 8; ++k) {
    output[stride * k] = step[k] + step[15 - k];
    output[stride * (15 - k)] = step[k] - step[15 - k];
  }
}
1742
/* 32-point inverse DCT in double precision, built from two 16-point passes.
 *
 * Reads 32 values from input[stride * k] and writes 32 values to
 * output[stride * k]. All inputs are read before any output is written.
 */
static void butterfly_32_idct_1d(double *input, double *output, int stride) {
  /* odd_cos[k] == cos(pi * (2 * k + 1) / 64), k = 0..15 */
  static const double odd_cos[16] = {
    0.998795456205, 0.989176509965, 0.970031253195, 0.941544065183,
    0.903989293123, 0.857728610000, 0.803207531481, 0.740951125355,
    0.671558954847, 0.595699304492, 0.514102744193, 0.427555093430,
    0.336889853392, 0.242980179903, 0.146730474455, 0.049067674327
  };
  static const double C16 = 0.707106781187;  /* cos(pi * 16 / 64) */

  double step1[32];
  double step2[32];
  int k;

  /* Lower half: the even-indexed inputs, in order. */
  for (k = 0; k < 16; ++k)
    step1[k] = input[stride * (2 * k)];

  /* Upper half: sums of neighbouring odd-indexed inputs; the first entry has
   * no lower neighbour and is scaled instead. */
  step1[16] = DownshiftMultiplyBy2(input[stride * 1] * C16);
  for (k = 1; k < 16; ++k)
    step1[16 + k] = input[stride * (2 * k + 1)] + input[stride * (2 * k - 1)];

  idct16(step1, step2, 1);
  idct16(step1 + 16, step2 + 16, 1);

  /* Divide each upper-half result by twice its cosine. */
  for (k = 0; k < 16; ++k)
    step2[16 + k] = DownshiftMultiply(step2[16 + k] / (2 * odd_cos[k]));

  /* Final butterfly: sums fill outputs 0..15, mirrored differences 16..31. */
  for (k = 0; k < 16; ++k) {
    output[stride * k] = step2[k] + step2[16 + k];
    output[stride * (16 + k)] = step2[15 - k] - step2[31 - k];
  }
}
1852
/* 32x32 inverse DCT computed in double precision: a 1-D pass over every row,
 * a 1-D pass over every column, then each value is divided by 128 and rounded.
 *
 * input:  32 rows of 32 coefficients; consecutive rows are (pitch >> 1)
 *         int16_t elements apart.
 * output: 32x32 result written densely (row stride 32); pitch is not applied
 *         to it.
 */
void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
  vp9_clear_system_state();  // Make it simd safe : __asm emms;
  {
    double row_pass[32 * 32], col_pass[32 * 32];
    double line_in[32], line_out[32];
    const int short_pitch = pitch >> 1;
    int row, col, k;

    /* Row pass. */
    for (row = 0; row < 32; ++row) {
      const int16_t *src = input + row * short_pitch;
      for (k = 0; k < 32; ++k)
        line_in[k] = src[k];
      butterfly_32_idct_1d(line_in, line_out, 1);
      for (k = 0; k < 32; ++k)
        row_pass[row * 32 + k] = line_out[k];
    }

    /* Column pass. */
    for (col = 0; col < 32; ++col) {
      for (k = 0; k < 32; ++k)
        line_in[k] = row_pass[k * 32 + col];
      butterfly_32_idct_1d(line_in, line_out, 1);
      for (k = 0; k < 32; ++k)
        col_pass[k * 32 + col] = line_out[k];
    }

    /* Normalize and convert back to 16-bit integers. */
    for (k = 0; k < 32 * 32; ++k)
      output[k] = round(col_pass[k] / 128);
  }
  vp9_clear_system_state();  // Make it simd safe : __asm emms;
}
1882
1883 #else // !CONFIG_DWTDCTHYBRID
1884
1885 #if DWT_TYPE == 53
1886
// Note: block length must be even for this implementation
//
// Inverse 5/3 lifting step along a row.
//   length:   number of output samples; length >> 1 low/high pairs are used.
//   lowpass:  length >> 1 low-band samples, updated in place by the first pass.
//   highpass: length >> 1 high-band samples, read only.
//   x:        receives the interleaved reconstructed samples.
// A length below 2 yields no samples, so the function returns without touching
// any buffer (previously the second loop's counter started at zero,
// decremented to -1 and ran past the end of the arrays).
static void synthesis_53_row(int length, int16_t *lowpass, int16_t *highpass,
                             int16_t *x) {
  const int half = length >> 1;
  int16_t prev, cur;
  int k;

  if (half < 1)
    return;

  // Undo the update step: subtract the rounded mean of the two neighbouring
  // high-band samples (the first sample uses highpass[0] twice).
  prev = highpass[0];
  for (k = 0; k < half; ++k) {
    lowpass[k] -= (prev + highpass[k] + 1) >> 1;
    prev = highpass[k];
  }

  // Undo the prediction step and interleave even/odd samples.
  for (k = 0; k < half - 1; ++k) {
    cur = lowpass[k];
    *x++ = (cur + 1) >> 1;
    *x++ = highpass[k] + ((cur + lowpass[k + 1] + 2) >> 2);
  }

  // Last pair: there is no right-hand low-band neighbour.
  cur = lowpass[half - 1];
  *x++ = (cur + 1) >> 1;
  *x++ = highpass[half - 1] + ((cur + 1) >> 1);
}
1912
// Inverse 5/3 lifting step along a column. Same structure as
// synthesis_53_row, but the output keeps one extra bit of scale: even samples
// are the low-band values themselves and the high-band term is doubled.
//   length:   number of output samples (must be even); length >> 1 pairs used.
//   lowpass:  length >> 1 low-band samples, updated in place by the first pass.
//   highpass: length >> 1 high-band samples, read only.
//   x:        receives the interleaved reconstructed samples.
// A length below 2 returns immediately without touching any buffer. The
// doubling is written as a multiplication because left-shifting a negative
// value is undefined behaviour in C.
static void synthesis_53_col(int length, int16_t *lowpass, int16_t *highpass,
                             int16_t *x) {
  const int half = length >> 1;
  int16_t prev, cur;
  int k;

  if (half < 1)
    return;

  // Undo the update step (the first sample uses highpass[0] twice).
  prev = highpass[0];
  for (k = 0; k < half; ++k) {
    lowpass[k] -= (prev + highpass[k] + 1) >> 1;
    prev = highpass[k];
  }

  // Undo the prediction step and interleave even/odd samples.
  for (k = 0; k < half - 1; ++k) {
    cur = lowpass[k];
    *x++ = cur;
    *x++ = highpass[k] * 2 + ((cur + lowpass[k + 1] + 1) >> 1);
  }

  // Last pair: there is no right-hand low-band neighbour.
  *x++ = lowpass[half - 1];
  *x++ = highpass[half - 1] * 2 + lowpass[half - 1];
}
1938
1939 static void dyadic_synthesize_53(int levels, int width, int height, int16_t *c,
1940 int pitch_c, int16_t *x, int pitch_x) {
1941 int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
1942 short buffer[2 * DWT_MAX_LENGTH];
1943
1944 th[0] = hh;
1945 tw[0] = hw;
1946 for (i = 1; i <= levels; i++) {
1947 th[i] = (th[i - 1] + 1) >> 1;
1948 tw[i] = (tw[i - 1] + 1) >> 1;
1949 }
1950 for (lv = levels - 1; lv >= 0; lv--) {
1951 nh = th[lv];
1952 nw = tw[lv];
1953 hh = th[lv + 1];
1954 hw = tw[lv + 1];
1955 if ((nh < 2) || (nw < 2)) continue;
1956 for (j = 0; j < nw; j++) {
1957 for (i = 0; i < nh; i++)
1958 buffer[i] = c[i * pitch_c + j];
1959 synthesis_53_col(nh, buffer, buffer + hh, buffer + nh);
1960 for (i = 0; i < nh; i++)
1961 c[i * pitch_c + j] = buffer[i + nh];
1962 }
1963 for (i = 0; i < nh; i++) {
1964 memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer));
1965 synthesis_53_row(nw, buffer, buffer + hw, &c[i * pitch_c]);
1966 }
1967 }
1968 for (i = 0; i < height; i++) {
1969 for (j = 0; j < width; j++) {
1970 x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ?
1971 ((c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS) :
1972 -((-c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS);
1973 }
1974 }
1975 }
1976
1977 #elif DWT_TYPE == 26
1978
1979 // Note: block length must be even for this implementation
// Inverse 2/6 lifting step along one row.
//
// length   - number of output samples (even).
// lowpass  - length/2 low-band coefficients (read only).
// highpass - length/2 high-band coefficients; adjusted in place when the
//            prediction step is undone.
// x        - receives `length` samples as (sum, difference) pairs, each
//            halved with rounding.
static void synthesis_26_row(int length, int16_t *lowpass, int16_t *highpass,
                             int16_t *x) {
  const int half = length >> 1;
  int k;

  // The prediction correction is only applied when there are at least four
  // coefficient pairs.
  if (half >= 4) {
    // First pair has no left neighbour: use the sample itself.
    highpass[0] += (lowpass[0] - lowpass[1] + 4) >> 3;
    // Interior pairs: difference of the left and right low-band neighbours.
    for (k = 1; k < half - 1; ++k) {
      highpass[k] += (lowpass[k - 1] - lowpass[k + 1] + 4) >> 3;
    }
    // Last pair has no right neighbour: use the sample itself.
    highpass[half - 1] += (lowpass[half - 2] - lowpass[half - 1] + 4) >> 3;
  }

  // Haar-style reconstruction of each output pair.
  for (k = 0; k < half; ++k) {
    const int16_t diff = highpass[k];
    const int16_t base = lowpass[k];
    x[2 * k] = (base + diff + 1) >> 1;
    x[2 * k + 1] = (base - diff + 1) >> 1;
  }
}
2004
// Inverse 2/6 lifting step along one column.
//
// Identical to the row variant except that the reconstructed (sum,
// difference) pairs are written without the halving.
//
// length   - number of output samples (even).
// lowpass  - length/2 low-band coefficients (read only).
// highpass - length/2 high-band coefficients; adjusted in place when the
//            prediction step is undone.
// x        - receives `length` interleaved output samples.
static void synthesis_26_col(int length, int16_t *lowpass, int16_t *highpass,
                             int16_t *x) {
  const int half = length >> 1;
  int k;

  // The prediction correction is only applied when there are at least four
  // coefficient pairs.
  if (half >= 4) {
    highpass[0] += (lowpass[0] - lowpass[1] + 4) >> 3;
    for (k = 1; k < half - 1; ++k) {
      highpass[k] += (lowpass[k - 1] - lowpass[k + 1] + 4) >> 3;
    }
    highpass[half - 1] += (lowpass[half - 2] - lowpass[half - 1] + 4) >> 3;
  }

  // Reconstruct each output pair as sum and difference.
  for (k = 0; k < half; ++k) {
    const int16_t diff = highpass[k];
    const int16_t base = lowpass[k];
    x[2 * k] = base + diff;
    x[2 * k + 1] = base - diff;
  }
}
2029
2030 static void dyadic_synthesize_26(int levels, int width, int height, int16_t *c,
2031 int pitch_c, int16_t *x, int pitch_x) {
2032 int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
2033 int16_t buffer[2 * DWT_MAX_LENGTH];
2034
2035 th[0] = hh;
2036 tw[0] = hw;
2037 for (i = 1; i <= levels; i++) {
2038 th[i] = (th[i - 1] + 1) >> 1;
2039 tw[i] = (tw[i - 1] + 1) >> 1;
2040 }
2041 for (lv = levels - 1; lv >= 0; lv--) {
2042 nh = th[lv];
2043 nw = tw[lv];
2044 hh = th[lv + 1];
2045 hw = tw[lv + 1];
2046 if ((nh < 2) || (nw < 2)) continue;
2047 for (j = 0; j < nw; j++) {
2048 for (i = 0; i < nh; i++)
2049 buffer[i] = c[i * pitch_c + j];
2050 synthesis_26_col(nh, buffer, buffer + hh, buffer + nh);
2051 for (i = 0; i < nh; i++)
2052 c[i * pitch_c + j] = buffer[i + nh];
2053 }
2054 for (i = 0; i < nh; i++) {
2055 memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer));
2056 synthesis_26_row(nw, buffer, buffer + hw, &c[i * pitch_c]);
2057 }
2058 }
2059 for (i = 0; i < height; i++) {
2060 for (j = 0; j < width; j++) {
2061 x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ?
2062 ((c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS) :
2063 -((-c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS);
2064 }
2065 }
2066 }
2067
2068 #elif DWT_TYPE == 97
2069
// Inverse 9/7 lifting step in double precision.
//
// length   - number of output samples; must be even. Values below 2 are
//            treated as "nothing to do".
// lowpass  - length/2 low-band coefficients (read only).
// highpass - length/2 high-band coefficients (read only).
// x        - receives `length` reconstructed samples. It is written while the
//            inputs are still being read, so it must not overlap lowpass or
//            highpass.
static void synthesis_97(int length, double *lowpass, double *highpass,
                         double *x) {
  static const double a_predict1 = -1.586134342;
  static const double a_update1 = -0.05298011854;
  static const double a_predict2 = 0.8829110762;
  static const double a_update2 = 0.4435068522;
  static const double s_low = 1.149604398;
  static const double s_high = 1 / 1.149604398;
  // Automatic storage on purpose: initialising a static from another variable
  // is not a constant expression in ISO C.
  const double inv_s_low = 1 / s_low;
  const double inv_s_high = 1 / s_high;
  const int half = length / 2;
  int i;

  // The boundary updates below index x[length - 2]; refuse shorter inputs.
  if (length < 2) return;

  // Undo pack and scale: interleave the two bands directly into x.
  for (i = 0; i < half; i++) {
    x[i * 2] = lowpass[i] * inv_s_low;
    x[i * 2 + 1] = highpass[i] * inv_s_high;
  }
  // Undo update 2 (x[0] mirrors its single neighbour).
  for (i = 2; i < length; i += 2) {
    x[i] -= a_update2 * (x[i - 1] + x[i + 1]);
  }
  x[0] -= 2 * a_update2 * x[1];
  // Undo predict 2 (the last sample mirrors its single neighbour).
  for (i = 1; i < length - 2; i += 2) {
    x[i] -= a_predict2 * (x[i - 1] + x[i + 1]);
  }
  x[length - 1] -= 2 * a_predict2 * x[length - 2];
  // Undo update 1
  for (i = 2; i < length; i += 2) {
    x[i] -= a_update1 * (x[i - 1] + x[i + 1]);
  }
  x[0] -= 2 * a_update1 * x[1];
  // Undo predict 1
  for (i = 1; i < length - 2; i += 2) {
    x[i] -= a_predict1 * (x[i - 1] + x[i + 1]);
  }
  x[length - 1] -= 2 * a_predict1 * x[length - 2];
}
2109
2110 static void dyadic_synthesize_97(int levels, int width, int height, int16_t *c,
2111 int pitch_c, int16_t *x, int pitch_x) {
2112 int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
2113 double buffer[2 * DWT_MAX_LENGTH];
2114 double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH];
2115
2116 th[0] = hh;
2117 tw[0] = hw;
2118 for (i = 1; i <= levels; i++) {
2119 th[i] = (th[i - 1] + 1) >> 1;
2120 tw[i] = (tw[i - 1] + 1) >> 1;
2121 }
2122 for (lv = levels - 1; lv >= 0; lv--) {
2123 nh = th[lv];
2124 nw = tw[lv];
2125 hh = th[lv + 1];
2126 hw = tw[lv + 1];
2127 if ((nh < 2) || (nw < 2)) continue;
2128 for (j = 0; j < nw; j++) {
2129 for (i = 0; i < nh; i++)
2130 buffer[i] = c[i * pitch_c + j];
2131 synthesis_97(nh, buffer, buffer + hh, buffer + nh);
2132 for (i = 0; i < nh; i++)
2133 y[i * DWT_MAX_LENGTH + j] = buffer[i + nh];
2134 }
2135 for (i = 0; i < nh; i++) {
2136 memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer));
2137 synthesis_97(nw, buffer, buffer + hw, &y[i * DWT_MAX_LENGTH]);
2138 }
2139 }
2140 for (i = 0; i < height; i++)
2141 for (j = 0; j < width; j++)
2142 x[i * pitch_x + j] = round(y[i * DWT_MAX_LENGTH + j] /
2143 (1 << DWT_PRECISION_BITS));
2144 }
2145
2146 #endif // DWT_TYPE
2147
2148 // TODO(debargha): Implement scaling differently so as not to have to use the
2149 // floating point 16x16 dct
// One-dimensional 16-point inverse DCT evaluated in double precision as a
// staged butterfly. Reads input[0..15] and leaves the result in
// output[0..15]; output is also used as scratch storage between step 3 and
// step 5. vp9_clear_system_state() is called on entry and on exit.
2150 static void butterfly_16x16_idct_1d_f(double input[16], double output[16]) {
// Each constant Ck below is numerically cos(k * pi / 32).
2151 static const double C1 = 0.995184726672197;
2152 static const double C2 = 0.98078528040323;
2153 static const double C3 = 0.956940335732209;
2154 static const double C4 = 0.923879532511287;
2155 static const double C5 = 0.881921264348355;
2156 static const double C6 = 0.831469612302545;
2157 static const double C7 = 0.773010453362737;
2158 static const double C8 = 0.707106781186548;
2159 static const double C9 = 0.634393284163646;
2160 static const double C10 = 0.555570233019602;
2161 static const double C11 = 0.471396736825998;
2162 static const double C12 = 0.38268343236509;
2163 static const double C13 = 0.290284677254462;
2164 static const double C14 = 0.195090322016128;
2165 static const double C15 = 0.098017140329561;
2166
2167 vp9_clear_system_state(); // Make it simd safe : __asm emms;
2168 {
2169 double step[16];
2170 double intermediate[16];
2171 double temp1, temp2;
2172
2173
2174 // step 1 and 2
// Even-indexed inputs (0, 2, 4, ..., 14) feed step[0..7].
2175 step[ 0] = input[0] + input[8];
2176 step[ 1] = input[0] - input[8];
2177
2178 temp1 = input[4]*C12;
2179 temp2 = input[12]*C4;
2180
2181 temp1 -= temp2;
2182 temp1 *= C8;
2183
2184 step[ 2] = 2*(temp1);
2185
2186 temp1 = input[4]*C4;
2187 temp2 = input[12]*C12;
2188 temp1 += temp2;
2189 temp1 = (temp1);
2190 temp1 *= C8;
2191 step[ 3] = 2*(temp1);
2192
2193 temp1 = input[2]*C8;
2194 temp1 = 2*(temp1);
2195 temp2 = input[6] + input[10];
2196
2197 step[ 4] = temp1 + temp2;
2198 step[ 5] = temp1 - temp2;
2199
2200 temp1 = input[14]*C8;
2201 temp1 = 2*(temp1);
2202 temp2 = input[6] - input[10];
2203
2204 step[ 6] = temp2 - temp1;
2205 step[ 7] = temp2 + temp1;
2206
2207 // for odd input
// Odd-indexed inputs are first combined into intermediate[8..15], then
// paired into step[8..15].
2208 temp1 = input[3]*C12;
2209 temp2 = input[13]*C4;
2210 temp1 += temp2;
2211 temp1 = (temp1);
2212 temp1 *= C8;
2213 intermediate[ 8] = 2*(temp1);
2214
2215 temp1 = input[3]*C4;
2216 temp2 = input[13]*C12;
2217 temp2 -= temp1;
2218 temp2 = (temp2);
2219 temp2 *= C8;
2220 intermediate[ 9] = 2*(temp2);
2221
2222 intermediate[10] = 2*(input[9]*C8);
2223 intermediate[11] = input[15] - input[1];
2224 intermediate[12] = input[15] + input[1];
2225 intermediate[13] = 2*((input[7]*C8));
2226
2227 temp1 = input[11]*C12;
2228 temp2 = input[5]*C4;
2229 temp2 -= temp1;
2230 temp2 = (temp2);
2231 temp2 *= C8;
2232 intermediate[14] = 2*(temp2);
2233
2234 temp1 = input[11]*C4;
2235 temp2 = input[5]*C12;
2236 temp1 += temp2;
2237 temp1 = (temp1);
2238 temp1 *= C8;
2239 intermediate[15] = 2*(temp1);
2240
2241 step[ 8] = intermediate[ 8] + intermediate[14];
2242 step[ 9] = intermediate[ 9] + intermediate[15];
2243 step[10] = intermediate[10] + intermediate[11];
2244 step[11] = intermediate[10] - intermediate[11];
2245 step[12] = intermediate[12] + intermediate[13];
2246 step[13] = intermediate[12] - intermediate[13];
2247 step[14] = intermediate[ 8] - intermediate[14];
2248 step[15] = intermediate[ 9] - intermediate[15];
2249
2250 // step 3
// From here on output[] holds intermediate values, not final results.
2251 output[0] = step[ 0] + step[ 3];
2252 output[1] = step[ 1] + step[ 2];
2253 output[2] = step[ 1] - step[ 2];
2254 output[3] = step[ 0] - step[ 3];
2255
2256 temp1 = step[ 4]*C14;
2257 temp2 = step[ 7]*C2;
2258 temp1 -= temp2;
2259 output[4] = (temp1);
2260
2261 temp1 = step[ 4]*C2;
2262 temp2 = step[ 7]*C14;
2263 temp1 += temp2;
2264 output[7] = (temp1);
2265
2266 temp1 = step[ 5]*C10;
2267 temp2 = step[ 6]*C6;
2268 temp1 -= temp2;
2269 output[5] = (temp1);
2270
2271 temp1 = step[ 5]*C6;
2272 temp2 = step[ 6]*C10;
2273 temp1 += temp2;
2274 output[6] = (temp1);
2275
2276 output[8] = step[ 8] + step[11];
2277 output[9] = step[ 9] + step[10];
2278 output[10] = step[ 9] - step[10];
2279 output[11] = step[ 8] - step[11];
2280 output[12] = step[12] + step[15];
2281 output[13] = step[13] + step[14];
2282 output[14] = step[13] - step[14];
2283 output[15] = step[12] - step[15];
2284
2285 // step 4
// step[] is rebuilt from the scratch values currently held in output[].
2286 step[ 0] = output[0] + output[7];
2287 step[ 1] = output[1] + output[6];
2288 step[ 2] = output[2] + output[5];
2289 step[ 3] = output[3] + output[4];
2290 step[ 4] = output[3] - output[4];
2291 step[ 5] = output[2] - output[5];
2292 step[ 6] = output[1] - output[6];
2293 step[ 7] = output[0] - output[7];
2294
2295 temp1 = output[8]*C7;
2296 temp2 = output[15]*C9;
2297 temp1 -= temp2;
2298 step[ 8] = (temp1);
2299
2300 temp1 = output[9]*C11;
2301 temp2 = output[14]*C5;
2302 temp1 += temp2;
2303 step[ 9] = (temp1);
2304
2305 temp1 = output[10]*C3;
2306 temp2 = output[13]*C13;
2307 temp1 -= temp2;
2308 step[10] = (temp1);
2309
2310 temp1 = output[11]*C15;
2311 temp2 = output[12]*C1;
2312 temp1 += temp2;
2313 step[11] = (temp1);
2314
2315 temp1 = output[11]*C1;
2316 temp2 = output[12]*C15;
2317 temp2 -= temp1;
2318 step[12] = (temp2);
2319
2320 temp1 = output[10]*C13;
2321 temp2 = output[13]*C3;
2322 temp1 += temp2;
2323 step[13] = (temp1);
2324
2325 temp1 = output[9]*C5;
2326 temp2 = output[14]*C11;
2327 temp2 -= temp1;
2328 step[14] = (temp2);
2329
2330 temp1 = output[8]*C9;
2331 temp2 = output[15]*C7;
2332 temp1 += temp2;
2333 step[15] = (temp1);
2334
2335 // step 5
// Final butterfly: output[k] and output[15 - k] are the sum and difference
// of step[k] and step[15 - k].
2336 output[0] = (step[0] + step[15]);
2337 output[1] = (step[1] + step[14]);
2338 output[2] = (step[2] + step[13]);
2339 output[3] = (step[3] + step[12]);
2340 output[4] = (step[4] + step[11]);
2341 output[5] = (step[5] + step[10]);
2342 output[6] = (step[6] + step[ 9]);
2343 output[7] = (step[7] + step[ 8]);
2344
2345 output[15] = (step[0] - step[15]);
2346 output[14] = (step[1] - step[14]);
2347 output[13] = (step[2] - step[13]);
2348 output[12] = (step[3] - step[12]);
2349 output[11] = (step[4] - step[11]);
2350 output[10] = (step[5] - step[10]);
2351 output[9] = (step[6] - step[ 9]);
2352 output[8] = (step[7] - step[ 8]);
2353 }
2354 vp9_clear_system_state(); // Make it simd safe : __asm emms;
2355 }
2356
// Floating-point 16x16 inverse DCT with a configurable output scale.
//
// input  - coefficients, 16 rows of 16 values; rows are pitch bytes apart.
// output - receives 16x16 results stored contiguously (row stride 16).
// pitch  - input row stride in bytes.
// scale  - each result is divided by (128 >> scale) before rounding.
static void vp9_short_idct16x16_c_f(int16_t *input, int16_t *output, int pitch,
                                    int scale) {
  vp9_clear_system_state();  // Make it simd safe : __asm emms;
  {
    double after_rows[16 * 16], after_cols[16 * 16];
    double line_in[16], line_out[16];
    const int short_pitch = pitch >> 1;  // stride in int16_t units
    int r, col, k;

    // Pass 1: transform every row of the input.
    for (r = 0; r < 16; ++r) {
      const int16_t *const src = input + r * short_pitch;
      for (k = 0; k < 16; ++k) {
        line_in[k] = src[k];
      }
      butterfly_16x16_idct_1d_f(line_in, line_out);
      for (k = 0; k < 16; ++k) {
        after_rows[r * 16 + k] = line_out[k];
      }
    }

    // Pass 2: transform every column of the row-transformed data.
    for (col = 0; col < 16; ++col) {
      for (k = 0; k < 16; ++k) {
        line_in[k] = after_rows[k * 16 + col];
      }
      butterfly_16x16_idct_1d_f(line_in, line_out);
      for (k = 0; k < 16; ++k) {
        after_cols[k * 16 + col] = line_out[k];
      }
    }

    // Scale down and round to the nearest integer.
    for (k = 0; k < 16 * 16; ++k) {
      output[k] = round(after_cols[k] / (128 >> scale));
    }
  }
  vp9_clear_system_state();  // Make it simd safe : __asm emms;
}
2387
// In-place 8-point inverse DCT: replaces x[0..7] with the product of the
// 8x8 basis matrix below (stored row-major) and the original x.
static void idct8_1d(double *x) {
  static const double idctmat[64] = {
    0.35355339059327,  0.49039264020162,  0.46193976625564,  0.41573480615127,
    0.35355339059327,  0.2777851165098,   0.19134171618254,  0.097545161008064,
    0.35355339059327,  0.41573480615127,  0.19134171618254, -0.097545161008064,
    -0.35355339059327, -0.49039264020161, -0.46193976625564, -0.2777851165098,
    0.35355339059327,  0.2777851165098,  -0.19134171618254, -0.49039264020162,
    -0.35355339059327,  0.097545161008064, 0.46193976625564,  0.41573480615127,
    0.35355339059327,  0.097545161008063, -0.46193976625564, -0.2777851165098,
    0.35355339059327,  0.41573480615127, -0.19134171618254, -0.49039264020162,
    0.35355339059327, -0.097545161008063, -0.46193976625564,  0.2777851165098,
    0.35355339059327, -0.41573480615127, -0.19134171618255,  0.49039264020162,
    0.35355339059327, -0.2777851165098,  -0.19134171618254,  0.49039264020161,
    -0.35355339059327, -0.097545161008064, 0.46193976625564, -0.41573480615127,
    0.35355339059327, -0.41573480615127,  0.19134171618254,  0.097545161008065,
    -0.35355339059327,  0.49039264020162, -0.46193976625564,  0.2777851165098,
    0.35355339059327, -0.49039264020162,  0.46193976625564, -0.41573480615127,
    0.35355339059327, -0.2777851165098,   0.19134171618255, -0.097545161008064
  };
  double result[8];
  int row, col;

  // Accumulate each output into a temporary so that x is not overwritten
  // while it is still being read.
  for (row = 0; row < 8; ++row) {
    const double *const basis = &idctmat[row * 8];
    double acc = 0;
    for (col = 0; col < 8; ++col) {
      acc += basis[col] * x[col];
    }
    result[row] = acc;
  }

  for (row = 0; row < 8; ++row) {
    x[row] = result[row];
  }
}
2418
// Floating-point 8x8 inverse DCT with a configurable output scale.
//
// coefs - coefficients, 8 rows of 8 values; rows are pitch bytes apart.
// block - receives 8x8 results stored contiguously (row stride 8).
// pitch - input row stride in bytes.
// scale - each result is divided by (8 >> scale) before rounding.
static void vp9_short_idct8x8_c_f(int16_t *coefs, int16_t *block, int pitch,
                                  int scale) {
  double work[8 * 8], column[8];
  int r, k;
  const int stride = pitch >> 1;  // stride in int16_t units

  vp9_clear_system_state();  // Make it simd safe : __asm emms;
  {
    // Load the coefficients into a dense double-precision 8x8 array.
    for (r = 0; r < 8; ++r) {
      const int16_t *const src = coefs + r * stride;
      for (k = 0; k < 8; ++k) {
        work[r * 8 + k] = (double)src[k];
      }
    }

    // Row pass: rows are contiguous, so transform them in place.
    for (r = 0; r < 8; ++r) {
      idct8_1d(&work[8 * r]);
    }

    // Column pass: gather each column, transform it, scatter it back.
    for (k = 0; k < 8; ++k) {
      for (r = 0; r < 8; ++r) {
        column[r] = work[k + 8 * r];
      }
      idct8_1d(column);
      for (r = 0; r < 8; ++r) {
        work[k + 8 * r] = column[r];
      }
    }

    // Scale down and round to the nearest integer.
    for (k = 0; k < 8 * 8; ++k) {
      block[k] = (int16_t)round(work[k] / (8 >> scale));
    }
  }
  vp9_clear_system_state();  // Make it simd safe : __asm emms;
}
2449
// Scale d by 2^n, where n may be negative.
// - n < 0: shift right by -n. The count must be negated: shifting by a
//   negative amount is undefined behaviour.
// - n >= 0: multiply by 2^n rather than left-shifting, because left-shifting
//   a negative d is undefined behaviour.
// Note that n is evaluated more than once.
#define multiply_bits(d, n) ((n) < 0 ? (d) >> -(n) : (d) * (1 << (n)))
2451
2452 #if DWTDCT_TYPE == DWTDCT16X16_LEAN
2453
2454 void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
2455 // assume output is a 32x32 buffer
2456 // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
2457 int16_t buffer[16 * 16];
2458 // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
2459 int16_t buffer2[32 * 32];
2460 // Note: pitch is in bytes, short_pitch is in short units
2461 const int short_pitch = pitch >> 1;
2462 int i, j;
2463
2464 // TODO(debargha): Implement more efficiently by adding output pitch
2465 // argument to the idct16x16 function
2466 vp9_short_idct16x16_c_f(input, buffer, pitch,
2467 1 + DWT_PRECISION_BITS);
2468 for (i = 0; i < 16; ++i) {
2469 vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16);
2470 }
2471 for (i = 0; i < 16; ++i) {
2472 for (j = 16; j < 32; ++j) {
2473 buffer2[i * 32 + j] =
2474 multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
2475 }
2476 }
2477 for (i = 16; i < 32; ++i) {
2478 for (j = 0; j < 32; ++j) {
2479 buffer2[i * 32 + j] =
2480 multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
2481 }
2482 }
2483 #if DWT_TYPE == 26
2484 dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32);
2485 #elif DWT_TYPE == 97
2486 dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32);
2487 #elif DWT_TYPE == 53
2488 dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32);
2489 #endif
2490 }
2491
2492 #elif DWTDCT_TYPE == DWTDCT16X16
2493
2494 void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
2495 // assume output is a 32x32 buffer
2496 // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
2497 int16_t buffer[16 * 16];
2498 // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
2499 int16_t buffer2[32 * 32];
2500 // Note: pitch is in bytes, short_pitch is in short units
2501 const int short_pitch = pitch >> 1;
2502 int i, j;
2503
2504 // TODO(debargha): Implement more efficiently by adding output pitch
2505 // argument to the idct16x16 function
2506 vp9_short_idct16x16_c_f(input, buffer, pitch,
2507 1 + DWT_PRECISION_BITS);
2508 for (i = 0; i < 16; ++i) {
2509 vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16);
2510 }
2511 vp9_short_idct16x16_c_f(input + 16, buffer, pitch,
2512 1 + DWT_PRECISION_BITS);
2513 for (i = 0; i < 16; ++i) {
2514 vpx_memcpy(buffer2 + i * 32 + 16, buffer + i * 16, sizeof(*buffer2) * 16);
2515 }
2516 vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch,
2517 1 + DWT_PRECISION_BITS);
2518 for (i = 0; i < 16; ++i) {
2519 vpx_memcpy(buffer2 + i * 32 + 16 * 32, buffer + i * 16,
2520 sizeof(*buffer2) * 16);
2521 }
2522 vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch,
2523 1 + DWT_PRECISION_BITS);
2524 for (i = 0; i < 16; ++i) {
2525 vpx_memcpy(buffer2 + i * 32 + 16 * 33, buffer + i * 16,
2526 sizeof(*buffer2) * 16);
2527 }
2528 #if DWT_TYPE == 26
2529 dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32);
2530 #elif DWT_TYPE == 97
2531 dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32);
2532 #elif DWT_TYPE == 53
2533 dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32);
2534 #endif
2535 }
2536
2537 #elif DWTDCT_TYPE == DWTDCT8X8
2538
2539 void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
2540 // assume output is a 32x32 buffer
2541 // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
2542 int16_t buffer[8 * 8];
2543 // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
2544 int16_t buffer2[32 * 32];
2545 // Note: pitch is in bytes, short_pitch is in short units
2546 const int short_pitch = pitch >> 1;
2547 int i, j;
2548
2549 // TODO(debargha): Implement more efficiently by adding output pitch
2550 // argument to the idct16x16 function
2551 vp9_short_idct8x8_c_f(input, buffer, pitch,
2552 1 + DWT_PRECISION_BITS);
2553 for (i = 0; i < 8; ++i) {
2554 vpx_memcpy(buffer2 + i * 32, buffer + i * 8, sizeof(*buffer2) * 8);
2555 }
2556 vp9_short_idct8x8_c_f(input + 8, buffer, pitch,
2557 1 + DWT_PRECISION_BITS);
2558 for (i = 0; i < 8; ++i) {
2559 vpx_memcpy(buffer2 + i * 32 + 8, buffer + i * 8, sizeof(*buffer2) * 8);
2560 }
2561 vp9_short_idct8x8_c_f(input + 8 * short_pitch, buffer, pitch,
2562 1 + DWT_PRECISION_BITS);
2563 for (i = 0; i < 8; ++i) {
2564 vpx_memcpy(buffer2 + i * 32 + 8 * 32, buffer + i * 8,
2565 sizeof(*buffer2) * 8);
2566 }
2567 vp9_short_idct8x8_c_f(input + 8 * short_pitch + 8, buffer, pitch,
2568 1 + DWT_PRECISION_BITS);
2569 for (i = 0; i < 8; ++i) {
2570 vpx_memcpy(buffer2 + i * 32 + 8 * 33, buffer + i * 8,
2571 sizeof(*buffer2) * 8);
2572 }
2573 for (i = 0; i < 16; ++i) {
2574 for (j = 16; j < 32; ++j) {
2575 buffer2[i * 32 + j] =
2576 multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
2577 }
2578 }
2579 for (i = 16; i < 32; ++i) {
2580 for (j = 0; j < 32; ++j) {
2581 buffer2[i * 32 + j] =
2582 multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
2583 }
2584 }
2585 #if DWT_TYPE == 26
2586 dyadic_synthesize_26(2, 32, 32, buffer2, 32, output, 32);
2587 #elif DWT_TYPE == 97
2588 dyadic_synthesize_97(2, 32, 32, buffer2, 32, output, 32);
2589 #elif DWT_TYPE == 53
2590 dyadic_synthesize_53(2, 32, 32, buffer2, 32, output, 32);
2591 #endif
2592 }
2593
2594 #endif
2595
2596 #if CONFIG_TX64X64
2597 void vp9_short_idct64x64_c(int16_t *input, int16_t *output, int pitch) {
2598 // assume output is a 64x64 buffer
2599 // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
2600 int16_t buffer[16 * 16];
2601 // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
2602 int16_t buffer2[64 * 64];
2603 // Note: pitch is in bytes, short_pitch is in short units
2604 const int short_pitch = pitch >> 1;
2605 int i, j;
2606
2607 // TODO(debargha): Implement more efficiently by adding output pitch
2608 // argument to the idct16x16 function
2609 vp9_short_idct16x16_c_f(input, buffer, pitch,
2610 2 + DWT_PRECISION_BITS);
2611 for (i = 0; i < 16; ++i) {
2612 vpx_memcpy(buffer2 + i * 64, buffer + i * 16, sizeof(*buffer2) * 16);
2613 }
2614 #if DWTDCT_TYPE == DWTDCT16X16_LEAN
2615 for (i = 0; i < 16; ++i) {
2616 for (j = 16; j < 64; ++j) {
2617 buffer2[i * 64 + j] =
2618 multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
2619 }
2620 }
2621 for (i = 16; i < 64; ++i) {
2622 for (j = 0; j < 64; ++j) {
2623 buffer2[i * 64 + j] =
2624 multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
2625 }
2626 }
2627 #elif DWTDCT_TYPE == DWTDCT16X16
2628 vp9_short_idct16x16_c_f(input + 16, buffer, pitch,
2629 2 + DWT_PRECISION_BITS);
2630 for (i = 0; i < 16; ++i) {
2631 vpx_memcpy(buffer2 + i * 64 + 16, buffer + i * 16, sizeof(*buffer2) * 16);
2632 }
2633 vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch,
2634 2 + DWT_PRECISION_BITS);
2635 for (i = 0; i < 16; ++i) {
2636 vpx_memcpy(buffer2 + i * 64 + 16 * 64, buffer + i * 16,
2637 sizeof(*buffer2) * 16);
2638 }
2639 vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch,
2640 2 + DWT_PRECISION_BITS);
2641 for (i = 0; i < 16; ++i) {
2642 vpx_memcpy(buffer2 + i * 64 + 16 * 65, buffer + i * 16,
2643 sizeof(*buffer2) * 16);
2644 }
2645
2646 // Copying and scaling highest bands into buffer2
2647 for (i = 0; i < 32; ++i) {
2648 for (j = 32; j < 64; ++j) {
2649 buffer2[i * 64 + j] =
2650 multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
2651 }
2652 }
2653 for (i = 32; i < 64; ++i) {
2654 for (j = 0; j < 64; ++j) {
2655 buffer2[i * 64 + j] =
2656 multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
2657 }
2658 }
2659 #endif // DWTDCT_TYPE
2660
2661 #if DWT_TYPE == 26
2662 dyadic_synthesize_26(2, 64, 64, buffer2, 64, output, 64);
2663 #elif DWT_TYPE == 97
2664 dyadic_synthesize_97(2, 64, 64, buffer2, 64, output, 64);
2665 #elif DWT_TYPE == 53
2666 dyadic_synthesize_53(2, 64, 64, buffer2, 64, output, 64);
2667 #endif
2668 }
2669 #endif // CONFIG_TX64X64
2670 #endif // !CONFIG_DWTDCTHYBRID
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/vp9_header.h ('k') | source/libvpx/vp9/common/vp9_implicit_segmentation.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698