source/libvpx/vp9/encoder/vp9_dct.c - Issue 11974002: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/encoder/vp9_dct.c

Issue 11974002: libvpx: Pull from upstream (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 7 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11	11

12 #include <assert.h>	12 #include <assert.h>

13 #include <math.h>	13 #include <math.h>

14 #include "vpx_ports/config.h"	14 #include "./vpx_config.h"

15 #include "vp9/common/vp9_systemdependent.h"	15 #include "vp9/common/vp9_systemdependent.h"

16	16

17 #include "vp9/common/vp9_blockd.h"	17 #include "vp9/common/vp9_blockd.h"

18	18

19 // TODO: these transforms can be converted into integer forms to reduce	19 // TODO: these transforms can be converted into integer forms to reduce

20 // the complexity	20 // the complexity

21 static const float dct_4[16] = {	21 static const float dct_4[16] = {

22 0.500000000000000, 0.500000000000000, 0.500000000000000, 0.500000000000000,	22 0.500000000000000, 0.500000000000000, 0.500000000000000, 0.500000000000000,

23 0.653281482438188, 0.270598050073099, -0.270598050073099, -0.653281482438188,	23 0.653281482438188, 0.270598050073099, -0.270598050073099, -0.653281482438188,

24 0.500000000000000, -0.500000000000000, -0.500000000000000, 0.500000000000000,	24 0.500000000000000, -0.500000000000000, -0.500000000000000, 0.500000000000000,

(...skipping 870 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
895 }	895 }

896	896

897 void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) {	897 void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) {

898 vp9_short_walsh4x4_x8_c(input, output, pitch);	898 vp9_short_walsh4x4_x8_c(input, output, pitch);

899 vp9_short_walsh4x4_x8_c(input + 4, output + 16, pitch);	899 vp9_short_walsh4x4_x8_c(input + 4, output + 16, pitch);

900 }	900 }

901 #endif	901 #endif

902	902

903 #define TEST_INT_16x16_DCT 1	903 #define TEST_INT_16x16_DCT 1

904 #if !TEST_INT_16x16_DCT	904 #if !TEST_INT_16x16_DCT

905 static const double C1 = 0.995184726672197;

906 static const double C2 = 0.98078528040323;

907 static const double C3 = 0.956940335732209;

908 static const double C4 = 0.923879532511287;

909 static const double C5 = 0.881921264348355;

910 static const double C6 = 0.831469612302545;

911 static const double C7 = 0.773010453362737;

912 static const double C8 = 0.707106781186548;

913 static const double C9 = 0.634393284163646;

914 static const double C10 = 0.555570233019602;

915 static const double C11 = 0.471396736825998;

916 static const double C12 = 0.38268343236509;

917 static const double C13 = 0.290284677254462;

918 static const double C14 = 0.195090322016128;

919 static const double C15 = 0.098017140329561;

920	905

921 static void dct16x16_1d(double input[16], double output[16]) {	906 static void dct16x16_1d(double input[16], double output[16]) {

	907 static const double C1 = 0.995184726672197;

	908 static const double C2 = 0.98078528040323;

	909 static const double C3 = 0.956940335732209;

	910 static const double C4 = 0.923879532511287;

	911 static const double C5 = 0.881921264348355;

	912 static const double C6 = 0.831469612302545;

	913 static const double C7 = 0.773010453362737;

	914 static const double C8 = 0.707106781186548;

	915 static const double C9 = 0.634393284163646;

	916 static const double C10 = 0.555570233019602;

	917 static const double C11 = 0.471396736825998;

	918 static const double C12 = 0.38268343236509;

	919 static const double C13 = 0.290284677254462;

	920 static const double C14 = 0.195090322016128;

	921 static const double C15 = 0.098017140329561;

	922

922 vp9_clear_system_state(); // Make it simd safe : __asm emms;	923 vp9_clear_system_state(); // Make it simd safe : __asm emms;

923 {	924 {

924 double step[16];	925 double step[16];

925 double intermediate[16];	926 double intermediate[16];

926 double temp1, temp2;	927 double temp1, temp2;

927	928

928 // step 1	929 // step 1

929 step[ 0] = input[0] + input[15];	930 step[ 0] = input[0] + input[15];

930 step[ 1] = input[1] + input[14];	931 step[ 1] = input[1] + input[14];

931 step[ 2] = input[2] + input[13];	932 step[ 2] = input[2] + input[13];

(...skipping 391 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1323 // Then transform rows	1324 // Then transform rows

1324 for (i = 0; i < 16; ++i) {	1325 for (i = 0; i < 16; ++i) {

1325 dct16x16_1d(outptr, out, 1);	1326 dct16x16_1d(outptr, out, 1);

1326 outptr += 16;	1327 outptr += 16;

1327 out += 16;	1328 out += 16;

1328 }	1329 }

1329 }	1330 }

1330 #undef RIGHT_SHIFT	1331 #undef RIGHT_SHIFT

1331 #undef ROUNDING	1332 #undef ROUNDING

1332 #endif	1333 #endif

	1334

	1335 #if !CONFIG_DWTDCTHYBRID

	1336 static void dct32_1d(double *input, double *output, int stride) {

	1337 static const double C1 = 0.998795456205; // cos(pi * 1 / 64)

	1338 static const double C2 = 0.995184726672; // cos(pi * 2 / 64)

	1339 static const double C3 = 0.989176509965; // cos(pi * 3 / 64)

	1340 static const double C4 = 0.980785280403; // cos(pi * 4 / 64)

	1341 static const double C5 = 0.970031253195; // cos(pi * 5 / 64)

	1342 static const double C6 = 0.956940335732; // cos(pi * 6 / 64)

	1343 static const double C7 = 0.941544065183; // cos(pi * 7 / 64)

	1344 static const double C8 = 0.923879532511; // cos(pi * 8 / 64)

	1345 static const double C9 = 0.903989293123; // cos(pi * 9 / 64)

	1346 static const double C10 = 0.881921264348; // cos(pi * 10 / 64)

	1347 static const double C11 = 0.857728610000; // cos(pi * 11 / 64)

	1348 static const double C12 = 0.831469612303; // cos(pi * 12 / 64)

	1349 static const double C13 = 0.803207531481; // cos(pi * 13 / 64)

	1350 static const double C14 = 0.773010453363; // cos(pi * 14 / 64)

	1351 static const double C15 = 0.740951125355; // cos(pi * 15 / 64)

	1352 static const double C16 = 0.707106781187; // cos(pi * 16 / 64)

	1353 static const double C17 = 0.671558954847; // cos(pi * 17 / 64)

	1354 static const double C18 = 0.634393284164; // cos(pi * 18 / 64)

	1355 static const double C19 = 0.595699304492; // cos(pi * 19 / 64)

	1356 static const double C20 = 0.555570233020; // cos(pi * 20 / 64)

	1357 static const double C21 = 0.514102744193; // cos(pi * 21 / 64)

	1358 static const double C22 = 0.471396736826; // cos(pi * 22 / 64)

	1359 static const double C23 = 0.427555093430; // cos(pi * 23 / 64)

	1360 static const double C24 = 0.382683432365; // cos(pi * 24 / 64)

	1361 static const double C25 = 0.336889853392; // cos(pi * 25 / 64)

	1362 static const double C26 = 0.290284677254; // cos(pi * 26 / 64)

	1363 static const double C27 = 0.242980179903; // cos(pi * 27 / 64)

	1364 static const double C28 = 0.195090322016; // cos(pi * 28 / 64)

	1365 static const double C29 = 0.146730474455; // cos(pi * 29 / 64)

	1366 static const double C30 = 0.098017140330; // cos(pi * 30 / 64)

	1367 static const double C31 = 0.049067674327; // cos(pi * 31 / 64)

	1368

	1369 double step[32];

	1370

	1371 // Stage 1

	1372 step[0] = input[stride*0] + input[stride*(32 - 1)];

	1373 step[1] = input[stride*1] + input[stride*(32 - 2)];

	1374 step[2] = input[stride*2] + input[stride*(32 - 3)];

	1375 step[3] = input[stride*3] + input[stride*(32 - 4)];

	1376 step[4] = input[stride*4] + input[stride*(32 - 5)];

	1377 step[5] = input[stride*5] + input[stride*(32 - 6)];

	1378 step[6] = input[stride*6] + input[stride*(32 - 7)];

	1379 step[7] = input[stride*7] + input[stride*(32 - 8)];

	1380 step[8] = input[stride*8] + input[stride*(32 - 9)];

	1381 step[9] = input[stride*9] + input[stride*(32 - 10)];

	1382 step[10] = input[stride*10] + input[stride*(32 - 11)];

	1383 step[11] = input[stride*11] + input[stride*(32 - 12)];

	1384 step[12] = input[stride*12] + input[stride*(32 - 13)];

	1385 step[13] = input[stride*13] + input[stride*(32 - 14)];

	1386 step[14] = input[stride*14] + input[stride*(32 - 15)];

	1387 step[15] = input[stride*15] + input[stride*(32 - 16)];

	1388 step[16] = -input[stride*16] + input[stride*(32 - 17)];

	1389 step[17] = -input[stride*17] + input[stride*(32 - 18)];

	1390 step[18] = -input[stride*18] + input[stride*(32 - 19)];

	1391 step[19] = -input[stride*19] + input[stride*(32 - 20)];

	1392 step[20] = -input[stride*20] + input[stride*(32 - 21)];

	1393 step[21] = -input[stride*21] + input[stride*(32 - 22)];

	1394 step[22] = -input[stride*22] + input[stride*(32 - 23)];

	1395 step[23] = -input[stride*23] + input[stride*(32 - 24)];

	1396 step[24] = -input[stride*24] + input[stride*(32 - 25)];

	1397 step[25] = -input[stride*25] + input[stride*(32 - 26)];

	1398 step[26] = -input[stride*26] + input[stride*(32 - 27)];

	1399 step[27] = -input[stride*27] + input[stride*(32 - 28)];

	1400 step[28] = -input[stride*28] + input[stride*(32 - 29)];

	1401 step[29] = -input[stride*29] + input[stride*(32 - 30)];

	1402 step[30] = -input[stride*30] + input[stride*(32 - 31)];

	1403 step[31] = -input[stride*31] + input[stride*(32 - 32)];

	1404

	1405 // Stage 2

	1406 output[stride*0] = step[0] + step[16 - 1];

	1407 output[stride*1] = step[1] + step[16 - 2];

	1408 output[stride*2] = step[2] + step[16 - 3];

	1409 output[stride*3] = step[3] + step[16 - 4];

	1410 output[stride*4] = step[4] + step[16 - 5];

	1411 output[stride*5] = step[5] + step[16 - 6];

	1412 output[stride*6] = step[6] + step[16 - 7];

	1413 output[stride*7] = step[7] + step[16 - 8];

	1414 output[stride*8] = -step[8] + step[16 - 9];

	1415 output[stride*9] = -step[9] + step[16 - 10];

	1416 output[stride*10] = -step[10] + step[16 - 11];

	1417 output[stride*11] = -step[11] + step[16 - 12];

	1418 output[stride*12] = -step[12] + step[16 - 13];

	1419 output[stride*13] = -step[13] + step[16 - 14];

	1420 output[stride*14] = -step[14] + step[16 - 15];

	1421 output[stride*15] = -step[15] + step[16 - 16];

	1422

	1423 output[stride*16] = step[16];

	1424 output[stride*17] = step[17];

	1425 output[stride*18] = step[18];

	1426 output[stride*19] = step[19];

	1427

	1428 output[stride*20] = (-step[20] + step[27])*C16;

	1429 output[stride*21] = (-step[21] + step[26])*C16;

	1430 output[stride*22] = (-step[22] + step[25])*C16;

	1431 output[stride*23] = (-step[23] + step[24])*C16;

	1432

	1433 output[stride*24] = (step[24] + step[23])*C16;

	1434 output[stride*25] = (step[25] + step[22])*C16;

	1435 output[stride*26] = (step[26] + step[21])*C16;

	1436 output[stride*27] = (step[27] + step[20])*C16;

	1437

	1438 output[stride*28] = step[28];

	1439 output[stride*29] = step[29];

	1440 output[stride*30] = step[30];

	1441 output[stride*31] = step[31];

	1442

	1443 // Stage 3

	1444 step[0] = output[stride*0] + output[stride*(8 - 1)];

	1445 step[1] = output[stride*1] + output[stride*(8 - 2)];

	1446 step[2] = output[stride*2] + output[stride*(8 - 3)];

	1447 step[3] = output[stride*3] + output[stride*(8 - 4)];

	1448 step[4] = -output[stride*4] + output[stride*(8 - 5)];

	1449 step[5] = -output[stride*5] + output[stride*(8 - 6)];

	1450 step[6] = -output[stride*6] + output[stride*(8 - 7)];

	1451 step[7] = -output[stride*7] + output[stride*(8 - 8)];

	1452 step[8] = output[stride*8];

	1453 step[9] = output[stride*9];

	1454 step[10] = (-output[stride*10] + output[stride*13])*C16;

	1455 step[11] = (-output[stride*11] + output[stride*12])*C16;

	1456 step[12] = (output[stride*12] + output[stride*11])*C16;

	1457 step[13] = (output[stride*13] + output[stride*10])*C16;

	1458 step[14] = output[stride*14];

	1459 step[15] = output[stride*15];

	1460

	1461 step[16] = output[stride*16] + output[stride*23];

	1462 step[17] = output[stride*17] + output[stride*22];

	1463 step[18] = output[stride*18] + output[stride*21];

	1464 step[19] = output[stride*19] + output[stride*20];

	1465 step[20] = -output[stride*20] + output[stride*19];

	1466 step[21] = -output[stride*21] + output[stride*18];

	1467 step[22] = -output[stride*22] + output[stride*17];

	1468 step[23] = -output[stride*23] + output[stride*16];

	1469 step[24] = -output[stride*24] + output[stride*31];

	1470 step[25] = -output[stride*25] + output[stride*30];

	1471 step[26] = -output[stride*26] + output[stride*29];

	1472 step[27] = -output[stride*27] + output[stride*28];

	1473 step[28] = output[stride*28] + output[stride*27];

	1474 step[29] = output[stride*29] + output[stride*26];

	1475 step[30] = output[stride*30] + output[stride*25];

	1476 step[31] = output[stride*31] + output[stride*24];

	1477

	1478 // Stage 4

	1479 output[stride*0] = step[0] + step[3];

	1480 output[stride*1] = step[1] + step[2];

	1481 output[stride*2] = -step[2] + step[1];

	1482 output[stride*3] = -step[3] + step[0];

	1483 output[stride*4] = step[4];

	1484 output[stride*5] = (-step[5] + step[6])*C16;

	1485 output[stride*6] = (step[6] + step[5])*C16;

	1486 output[stride*7] = step[7];

	1487 output[stride*8] = step[8] + step[11];

	1488 output[stride*9] = step[9] + step[10];

	1489 output[stride*10] = -step[10] + step[9];

	1490 output[stride*11] = -step[11] + step[8];

	1491 output[stride*12] = -step[12] + step[15];

	1492 output[stride*13] = -step[13] + step[14];

	1493 output[stride*14] = step[14] + step[13];

	1494 output[stride*15] = step[15] + step[12];

	1495

	1496 output[stride*16] = step[16];

	1497 output[stride*17] = step[17];

	1498 output[stride*18] = step[18]*-C8 + step[29]*C24;

	1499 output[stride*19] = step[19]*-C8 + step[28]*C24;

	1500 output[stride*20] = step[20]*-C24 + step[27]*-C8;

	1501 output[stride*21] = step[21]*-C24 + step[26]*-C8;

	1502 output[stride*22] = step[22];

	1503 output[stride*23] = step[23];

	1504 output[stride*24] = step[24];

	1505 output[stride*25] = step[25];

	1506 output[stride*26] = step[26]*C24 + step[21]*-C8;

	1507 output[stride*27] = step[27]*C24 + step[20]*-C8;

	1508 output[stride*28] = step[28]*C8 + step[19]*C24;

	1509 output[stride*29] = step[29]*C8 + step[18]*C24;

	1510 output[stride*30] = step[30];

	1511 output[stride*31] = step[31];

	1512

	1513 // Stage 5

	1514 step[0] = (output[stride*0] + output[stride*1]) * C16;

	1515 step[1] = (-output[stride*1] + output[stride*0]) * C16;

	1516 step[2] = output[stride*2]*C24 + output[stride*3] * C8;

	1517 step[3] = output[stride*3]*C24 - output[stride*2] * C8;

	1518 step[4] = output[stride*4] + output[stride*5];

	1519 step[5] = -output[stride*5] + output[stride*4];

	1520 step[6] = -output[stride*6] + output[stride*7];

	1521 step[7] = output[stride*7] + output[stride*6];

	1522 step[8] = output[stride*8];

	1523 step[9] = output[stride*9]*-C8 + output[stride*14]*C24;

	1524 step[10] = output[stride*10]*-C24 + output[stride*13]*-C8;

	1525 step[11] = output[stride*11];

	1526 step[12] = output[stride*12];

	1527 step[13] = output[stride*13]*C24 + output[stride*10]*-C8;

	1528 step[14] = output[stride*14]*C8 + output[stride*9]*C24;

	1529 step[15] = output[stride*15];

	1530

	1531 step[16] = output[stride*16] + output[stride*19];

	1532 step[17] = output[stride*17] + output[stride*18];

	1533 step[18] = -output[stride*18] + output[stride*17];

	1534 step[19] = -output[stride*19] + output[stride*16];

	1535 step[20] = -output[stride*20] + output[stride*23];

	1536 step[21] = -output[stride*21] + output[stride*22];

	1537 step[22] = output[stride*22] + output[stride*21];

	1538 step[23] = output[stride*23] + output[stride*20];

	1539 step[24] = output[stride*24] + output[stride*27];

	1540 step[25] = output[stride*25] + output[stride*26];

	1541 step[26] = -output[stride*26] + output[stride*25];

	1542 step[27] = -output[stride*27] + output[stride*24];

	1543 step[28] = -output[stride*28] + output[stride*31];

	1544 step[29] = -output[stride*29] + output[stride*30];

	1545 step[30] = output[stride*30] + output[stride*29];

	1546 step[31] = output[stride*31] + output[stride*28];

	1547

	1548 // Stage 6

	1549 output[stride*0] = step[0];

	1550 output[stride*1] = step[1];

	1551 output[stride*2] = step[2];

	1552 output[stride*3] = step[3];

	1553 output[stride*4] = step[4]*C28 + step[7]*C4;

	1554 output[stride*5] = step[5]*C12 + step[6]*C20;

	1555 output[stride*6] = step[6]*C12 + step[5]*-C20;

	1556 output[stride*7] = step[7]*C28 + step[4]*-C4;

	1557 output[stride*8] = step[8] + step[9];

	1558 output[stride*9] = -step[9] + step[8];

	1559 output[stride*10] = -step[10] + step[11];

	1560 output[stride*11] = step[11] + step[10];

	1561 output[stride*12] = step[12] + step[13];

	1562 output[stride*13] = -step[13] + step[12];

	1563 output[stride*14] = -step[14] + step[15];

	1564 output[stride*15] = step[15] + step[14];

	1565

	1566 output[stride*16] = step[16];

	1567 output[stride*17] = step[17]*-C4 + step[30]*C28;

	1568 output[stride*18] = step[18]*-C28 + step[29]*-C4;

	1569 output[stride*19] = step[19];

	1570 output[stride*20] = step[20];

	1571 output[stride*21] = step[21]*-C20 + step[26]*C12;

	1572 output[stride*22] = step[22]*-C12 + step[25]*-C20;

	1573 output[stride*23] = step[23];

	1574 output[stride*24] = step[24];

	1575 output[stride*25] = step[25]*C12 + step[22]*-C20;

	1576 output[stride*26] = step[26]*C20 + step[21]*C12;

	1577 output[stride*27] = step[27];

	1578 output[stride*28] = step[28];

	1579 output[stride*29] = step[29]*C28 + step[18]*-C4;

	1580 output[stride*30] = step[30]*C4 + step[17]*C28;

	1581 output[stride*31] = step[31];

	1582

	1583 // Stage 7

	1584 step[0] = output[stride*0];

	1585 step[1] = output[stride*1];

	1586 step[2] = output[stride*2];

	1587 step[3] = output[stride*3];

	1588 step[4] = output[stride*4];

	1589 step[5] = output[stride*5];

	1590 step[6] = output[stride*6];

	1591 step[7] = output[stride*7];

	1592 step[8] = output[stride*8]*C30 + output[stride*15]*C2;

	1593 step[9] = output[stride*9]*C14 + output[stride*14]*C18;

	1594 step[10] = output[stride*10]*C22 + output[stride*13]*C10;

	1595 step[11] = output[stride*11]*C6 + output[stride*12]*C26;

	1596 step[12] = output[stride*12]*C6 + output[stride*11]*-C26;

	1597 step[13] = output[stride*13]*C22 + output[stride*10]*-C10;

	1598 step[14] = output[stride*14]*C14 + output[stride*9]*-C18;

	1599 step[15] = output[stride*15]*C30 + output[stride*8]*-C2;

	1600

	1601 step[16] = output[stride*16] + output[stride*17];

	1602 step[17] = -output[stride*17] + output[stride*16];

	1603 step[18] = -output[stride*18] + output[stride*19];

	1604 step[19] = output[stride*19] + output[stride*18];

	1605 step[20] = output[stride*20] + output[stride*21];

	1606 step[21] = -output[stride*21] + output[stride*20];

	1607 step[22] = -output[stride*22] + output[stride*23];

	1608 step[23] = output[stride*23] + output[stride*22];

	1609 step[24] = output[stride*24] + output[stride*25];

	1610 step[25] = -output[stride*25] + output[stride*24];

	1611 step[26] = -output[stride*26] + output[stride*27];

	1612 step[27] = output[stride*27] + output[stride*26];

	1613 step[28] = output[stride*28] + output[stride*29];

	1614 step[29] = -output[stride*29] + output[stride*28];

	1615 step[30] = -output[stride*30] + output[stride*31];

	1616 step[31] = output[stride*31] + output[stride*30];

	1617

	1618 // Final stage --- outputs indices are bit-reversed.

	1619 output[stride*0] = step[0];

	1620 output[stride*16] = step[1];

	1621 output[stride*8] = step[2];

	1622 output[stride*24] = step[3];

	1623 output[stride*4] = step[4];

	1624 output[stride*20] = step[5];

	1625 output[stride*12] = step[6];

	1626 output[stride*28] = step[7];

	1627 output[stride*2] = step[8];

	1628 output[stride*18] = step[9];

	1629 output[stride*10] = step[10];

	1630 output[stride*26] = step[11];

	1631 output[stride*6] = step[12];

	1632 output[stride*22] = step[13];

	1633 output[stride*14] = step[14];

	1634 output[stride*30] = step[15];

	1635

	1636 output[stride*1] = step[16]*C31 + step[31]*C1;

	1637 output[stride*17] = step[17]*C15 + step[30]*C17;

	1638 output[stride*9] = step[18]*C23 + step[29]*C9;

	1639 output[stride*25] = step[19]*C7 + step[28]*C25;

	1640 output[stride*5] = step[20]*C27 + step[27]*C5;

	1641 output[stride*21] = step[21]*C11 + step[26]*C21;

	1642 output[stride*13] = step[22]*C19 + step[25]*C13;

	1643 output[stride*29] = step[23]*C3 + step[24]*C29;

	1644 output[stride*3] = step[24]*C3 + step[23]*-C29;

	1645 output[stride*19] = step[25]*C19 + step[22]*-C13;

	1646 output[stride*11] = step[26]*C11 + step[21]*-C21;

	1647 output[stride*27] = step[27]*C27 + step[20]*-C5;

	1648 output[stride*7] = step[28]*C7 + step[19]*-C25;

	1649 output[stride*23] = step[29]*C23 + step[18]*-C9;

	1650 output[stride*15] = step[30]*C15 + step[17]*-C17;

	1651 output[stride*31] = step[31]*C31 + step[16]*-C1;

	1652 }

	1653

	1654 void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {

	1655 vp9_clear_system_state(); // Make it simd safe : __asm emms;

	1656 {

	1657 int shortpitch = pitch >> 1;

	1658 int i, j;

	1659 double output[1024];

	1660 // First transform columns

	1661 for (i = 0; i < 32; i++) {

	1662 double temp_in[32], temp_out[32];

	1663 for (j = 0; j < 32; j++)

	1664 temp_in[j] = input[j*shortpitch + i];

	1665 dct32_1d(temp_in, temp_out, 1);

	1666 for (j = 0; j < 32; j++)

	1667 output[j*32 + i] = temp_out[j];

	1668 }

	1669 // Then transform rows

	1670 for (i = 0; i < 32; ++i) {

	1671 double temp_in[32], temp_out[32];

	1672 for (j = 0; j < 32; ++j)

	1673 temp_in[j] = output[j + i*32];

	1674 dct32_1d(temp_in, temp_out, 1);

	1675 for (j = 0; j < 32; ++j)

	1676 output[j + i*32] = temp_out[j];

	1677 }

	1678 // Scale by some magic number

	1679 for (i = 0; i < 1024; i++) {

	1680 out[i] = (short)round(output[i]/4);

	1681 }

	1682 }

	1683

	1684 vp9_clear_system_state(); // Make it simd safe : __asm emms;

	1685 }

	1686

	1687 #else // CONFIG_DWTDCTHYBRID

	1688

	1689 #if DWT_TYPE == 53

	1690

	1691 // Note: block length must be even for this implementation

	1692 static void analysis_53_row(int length, short *x,

	1693 short *lowpass, short *highpass) {

	1694 int n;

	1695 short r, *a, *b;

	1696

	1697 n = length >> 1;

	1698 b = highpass;

	1699 a = lowpass;

	1700 while (--n) {

	1701 *a++ = (r = *x++) << 1;

	1702 *b++ = *x - ((r + x[1] + 1) >> 1);

	1703 x++;

	1704 }

	1705 *a = (r = *x++) << 1;

	1706 *b = *x - r;

	1707

	1708 n = length >> 1;

	1709 b = highpass;

	1710 a = lowpass;

	1711 r = *highpass;

	1712 while (n--) {

	1713 *a++ += (r + (*b) + 1) >> 1;

	1714 r = *b++;

	1715 }

	1716 }

	1717

	1718 static void analysis_53_col(int length, short *x,

	1719 short *lowpass, short *highpass) {

	1720 int n;

	1721 short r, *a, *b;

	1722

	1723 n = length >> 1;

	1724 b = highpass;

	1725 a = lowpass;

	1726 while (--n) {

	1727 *a++ = (r = *x++);

	1728 *b++ = (((*x) << 1) - (r + x[1]) + 2) >> 2;

	1729 x++;

	1730 }

	1731 *a = (r = *x++);

	1732 *b = (*x - r + 1) >> 1;

	1733

	1734 n = length >> 1;

	1735 b = highpass;

	1736 a = lowpass;

	1737 r = *highpass;

	1738 while (n--) {

	1739 *a++ += (r + (*b) + 1) >> 1;

	1740 r = *b++;

	1741 }

	1742 }

	1743

	1744 static void dyadic_analyze_53(int levels, int width, int height,

	1745 short *x, int pitch_x, short *c, int pitch_c) {

	1746 int lv, i, j, nh, nw, hh = height, hw = width;

	1747 short buffer[2 * DWT_MAX_LENGTH];

	1748 for (i = 0; i < height; i++) {

	1749 for (j = 0; j < width; j++) {

	1750 c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS;

	1751 }

	1752 }

	1753 for (lv = 0; lv < levels; lv++) {

	1754 nh = hh;

	1755 hh = (hh + 1) >> 1;

	1756 nw = hw;

	1757 hw = (hw + 1) >> 1;

	1758 if ((nh < 2) || (nw < 2)) return;

	1759 for (i = 0; i < nh; i++) {

	1760 memcpy(buffer, &c[i * pitch_c], nw * sizeof(short));

	1761 analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);

	1762 }

	1763 for (j = 0; j < nw; j++) {

	1764 for (i = 0; i < nh; i++)

	1765 buffer[i + nh] = c[i * pitch_c + j];

	1766 analysis_53_col(nh, buffer + nh, buffer, buffer + hh);

	1767 for (i = 0; i < nh; i++)

	1768 c[i * pitch_c + j] = buffer[i];

	1769 }

	1770 }

	1771 }

	1772

	1773 #elif DWT_TYPE == 26

	1774

	1775 static void analysis_26_row(int length, short *x,

	1776 short *lowpass, short *highpass) {

	1777 int i, n;

	1778 short r, s, *a, *b;

	1779 a = lowpass;

	1780 b = highpass;

	1781 for (i = length >> 1; i; i--) {

	1782 r = *x++;

	1783 s = *x++;

	1784 *a++ = r + s;

	1785 *b++ = r - s;

	1786 }

	1787 n = length >> 1;

	1788 if (n >= 4) {

	1789 a = lowpass;

	1790 b = highpass;

	1791 r = *lowpass;

	1792 while (--n) {

	1793 *b++ -= (r - a[1] + 4) >> 3;

	1794 r = *a++;

	1795 }

	1796 *b -= (r - *a + 4) >> 3;

	1797 }

	1798 }

	1799

	1800 static void analysis_26_col(int length, short *x,

	1801 short *lowpass, short *highpass) {

	1802 int i, n;

	1803 short r, s, *a, *b;

	1804 a = lowpass;

	1805 b = highpass;

	1806 for (i = length >> 1; i; i--) {

	1807 r = *x++;

	1808 s = *x++;

	1809 *a++ = (r + s + 1) >> 1;

	1810 *b++ = (r - s + 1) >> 1;

	1811 }

	1812 n = length >> 1;

	1813 if (n >= 4) {

	1814 a = lowpass;

	1815 b = highpass;

	1816 r = *lowpass;

	1817 while (--n) {

	1818 *b++ -= (r - a[1] + 4) >> 3;

	1819 r = *a++;

	1820 }

	1821 *b -= (r - *a + 4) >> 3;

	1822 }

	1823 }

	1824

	1825 static void dyadic_analyze_26(int levels, int width, int height,

	1826 short *x, int pitch_x, short *c, int pitch_c) {

	1827 int lv, i, j, nh, nw, hh = height, hw = width;

	1828 short buffer[2 * DWT_MAX_LENGTH];

	1829 for (i = 0; i < height; i++) {

	1830 for (j = 0; j < width; j++) {

	1831 c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS;

	1832 }

	1833 }

	1834 for (lv = 0; lv < levels; lv++) {

	1835 nh = hh;

	1836 hh = (hh + 1) >> 1;

	1837 nw = hw;

	1838 hw = (hw + 1) >> 1;

	1839 if ((nh < 2) || (nw < 2)) return;

	1840 for (i = 0; i < nh; i++) {

	1841 memcpy(buffer, &c[i * pitch_c], nw * sizeof(short));

	1842 analysis_26_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);

	1843 }

	1844 for (j = 0; j < nw; j++) {

	1845 for (i = 0; i < nh; i++)

	1846 buffer[i + nh] = c[i * pitch_c + j];

	1847 analysis_26_col(nh, buffer + nh, buffer, buffer + hh);

	1848 for (i = 0; i < nh; i++)

	1849 c[i * pitch_c + j] = buffer[i];

	1850 }

	1851 }

	1852 }

	1853

	1854 #elif DWT_TYPE == 97

	1855

	1856 static void analysis_97(int length, double *x,

	1857 double *lowpass, double *highpass) {

	1858 static const double a_predict1 = -1.586134342;

	1859 static const double a_update1 = -0.05298011854;

	1860 static const double a_predict2 = 0.8829110762;

	1861 static const double a_update2 = 0.4435068522;

	1862 static const double s_low = 1.149604398;

	1863 static const double s_high = 1/1.149604398;

	1864 int i;

	1865 double y[DWT_MAX_LENGTH];

	1866 // Predict 1

	1867 for (i = 1; i < length - 2; i += 2) {

	1868 x[i] += a_predict1 * (x[i - 1] + x[i + 1]);

	1869 }

	1870 x[length - 1] += 2 * a_predict1 * x[length - 2];

	1871 // Update 1

	1872 for (i = 2; i < length; i += 2) {

	1873 x[i] += a_update1 * (x[i - 1] + x[i + 1]);

	1874 }

	1875 x[0] += 2 * a_update1 * x[1];

	1876 // Predict 2

	1877 for (i = 1; i < length - 2; i += 2) {

	1878 x[i] += a_predict2 * (x[i - 1] + x[i + 1]);

	1879 }

	1880 x[length - 1] += 2 * a_predict2 * x[length - 2];

	1881 // Update 2

	1882 for (i = 2; i < length; i += 2) {

	1883 x[i] += a_update2 * (x[i - 1] + x[i + 1]);

	1884 }

	1885 x[0] += 2 * a_update2 * x[1];

	1886 memcpy(y, x, sizeof(*y) * length);

	1887 // Scale and pack

	1888 for (i = 0; i < length / 2; i++) {

	1889 lowpass[i] = y[2 * i] * s_low;

	1890 highpass[i] = y[2 * i + 1] * s_high;

	1891 }

	1892 }

	1893

	1894 static void dyadic_analyze_97(int levels, int width, int height,

	1895 short *x, int pitch_x, short *c, int pitch_c) {

	1896 int lv, i, j, nh, nw, hh = height, hw = width;

	1897 double buffer[2 * DWT_MAX_LENGTH];

	1898 double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH];

	1899 for (i = 0; i < height; i++) {

	1900 for (j = 0; j < width; j++) {

	1901 y[i * DWT_MAX_LENGTH + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS;

	1902 }

	1903 }

	1904 for (lv = 0; lv < levels; lv++) {

	1905 nh = hh;

	1906 hh = (hh + 1) >> 1;

	1907 nw = hw;

	1908 hw = (hw + 1) >> 1;

	1909 if ((nh < 2) || (nw < 2)) return;

	1910 for (i = 0; i < nh; i++) {

	1911 memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer));

	1912 analysis_97(nw, buffer, &y[i * DWT_MAX_LENGTH],

	1913 &y[i * DWT_MAX_LENGTH] + hw);

	1914 }

	1915 for (j = 0; j < nw; j++) {

	1916 for (i = 0; i < nh; i++)

	1917 buffer[i + nh] = y[i * DWT_MAX_LENGTH + j];

	1918 analysis_97(nh, buffer + nh, buffer, buffer + hh);

	1919 for (i = 0; i < nh; i++)

	1920 c[i * pitch_c + j] = round(buffer[i]);

	1921 }

	1922 }

	1923 }

	1924

	1925 #endif // DWT_TYPE

	1926

	1927 // TODO(debargha): Implement the scaling differently so as not to have to

	1928 // use the floating point dct

	1929 static void dct16x16_1d_f(double input[16], double output[16]) {

	1930 static const double C1 = 0.995184726672197;

	1931 static const double C2 = 0.98078528040323;

	1932 static const double C3 = 0.956940335732209;

	1933 static const double C4 = 0.923879532511287;

	1934 static const double C5 = 0.881921264348355;

	1935 static const double C6 = 0.831469612302545;

	1936 static const double C7 = 0.773010453362737;

	1937 static const double C8 = 0.707106781186548;

	1938 static const double C9 = 0.634393284163646;

	1939 static const double C10 = 0.555570233019602;

	1940 static const double C11 = 0.471396736825998;

	1941 static const double C12 = 0.38268343236509;

	1942 static const double C13 = 0.290284677254462;

	1943 static const double C14 = 0.195090322016128;

	1944 static const double C15 = 0.098017140329561;

	1945

	1946 vp9_clear_system_state(); // Make it simd safe : __asm emms;

	1947 {

	1948 double step[16];

	1949 double intermediate[16];

	1950 double temp1, temp2;

	1951

	1952 // step 1

	1953 step[ 0] = input[0] + input[15];

	1954 step[ 1] = input[1] + input[14];

	1955 step[ 2] = input[2] + input[13];

	1956 step[ 3] = input[3] + input[12];

	1957 step[ 4] = input[4] + input[11];

	1958 step[ 5] = input[5] + input[10];

	1959 step[ 6] = input[6] + input[ 9];

	1960 step[ 7] = input[7] + input[ 8];

	1961 step[ 8] = input[7] - input[ 8];

	1962 step[ 9] = input[6] - input[ 9];

	1963 step[10] = input[5] - input[10];

	1964 step[11] = input[4] - input[11];

	1965 step[12] = input[3] - input[12];

	1966 step[13] = input[2] - input[13];

	1967 step[14] = input[1] - input[14];

	1968 step[15] = input[0] - input[15];

	1969

	1970 // step 2

	1971 output[0] = step[0] + step[7];

	1972 output[1] = step[1] + step[6];

	1973 output[2] = step[2] + step[5];

	1974 output[3] = step[3] + step[4];

	1975 output[4] = step[3] - step[4];

	1976 output[5] = step[2] - step[5];

	1977 output[6] = step[1] - step[6];

	1978 output[7] = step[0] - step[7];

	1979

	1980 temp1 = step[ 8]*C7;

	1981 temp2 = step[15]*C9;

	1982 output[ 8] = temp1 + temp2;

	1983

	1984 temp1 = step[ 9]*C11;

	1985 temp2 = step[14]*C5;

	1986 output[ 9] = temp1 - temp2;

	1987

	1988 temp1 = step[10]*C3;

	1989 temp2 = step[13]*C13;

	1990 output[10] = temp1 + temp2;

	1991

	1992 temp1 = step[11]*C15;

	1993 temp2 = step[12]*C1;

	1994 output[11] = temp1 - temp2;

	1995

	1996 temp1 = step[11]*C1;

	1997 temp2 = step[12]*C15;

	1998 output[12] = temp2 + temp1;

	1999

	2000 temp1 = step[10]*C13;

	2001 temp2 = step[13]*C3;

	2002 output[13] = temp2 - temp1;

	2003

	2004 temp1 = step[ 9]*C5;

	2005 temp2 = step[14]*C11;

	2006 output[14] = temp2 + temp1;

	2007

	2008 temp1 = step[ 8]*C9;

	2009 temp2 = step[15]*C7;

	2010 output[15] = temp2 - temp1;

	2011

	2012 // step 3

	2013 step[ 0] = output[0] + output[3];

	2014 step[ 1] = output[1] + output[2];

	2015 step[ 2] = output[1] - output[2];

	2016 step[ 3] = output[0] - output[3];

	2017

	2018 temp1 = output[4]*C14;

	2019 temp2 = output[7]*C2;

	2020 step[ 4] = temp1 + temp2;

	2021

	2022 temp1 = output[5]*C10;

	2023 temp2 = output[6]*C6;

	2024 step[ 5] = temp1 + temp2;

	2025

	2026 temp1 = output[5]*C6;

	2027 temp2 = output[6]*C10;

	2028 step[ 6] = temp2 - temp1;

	2029

	2030 temp1 = output[4]*C2;

	2031 temp2 = output[7]*C14;

	2032 step[ 7] = temp2 - temp1;

	2033

	2034 step[ 8] = output[ 8] + output[11];

	2035 step[ 9] = output[ 9] + output[10];

	2036 step[10] = output[ 9] - output[10];

	2037 step[11] = output[ 8] - output[11];

	2038

	2039 step[12] = output[12] + output[15];

	2040 step[13] = output[13] + output[14];

	2041 step[14] = output[13] - output[14];

	2042 step[15] = output[12] - output[15];

	2043

	2044 // step 4

	2045 output[ 0] = (step[ 0] + step[ 1]);

	2046 output[ 8] = (step[ 0] - step[ 1]);

	2047

	2048 temp1 = step[2]*C12;

	2049 temp2 = step[3]*C4;

	2050 temp1 = temp1 + temp2;

	2051 output[ 4] = 2*(temp1*C8);

	2052

	2053 temp1 = step[2]*C4;

	2054 temp2 = step[3]*C12;

	2055 temp1 = temp2 - temp1;

	2056 output[12] = 2*(temp1*C8);

	2057

	2058 output[ 2] = 2*((step[4] + step[ 5])*C8);

	2059 output[14] = 2*((step[7] - step[ 6])*C8);

	2060

	2061 temp1 = step[4] - step[5];

	2062 temp2 = step[6] + step[7];

	2063 output[ 6] = (temp1 + temp2);

	2064 output[10] = (temp1 - temp2);

	2065

	2066 intermediate[8] = step[8] + step[14];

	2067 intermediate[9] = step[9] + step[15];

	2068

	2069 temp1 = intermediate[8]*C12;

	2070 temp2 = intermediate[9]*C4;

	2071 temp1 = temp1 - temp2;

	2072 output[3] = 2*(temp1*C8);

	2073

	2074 temp1 = intermediate[8]*C4;

	2075 temp2 = intermediate[9]*C12;

	2076 temp1 = temp2 + temp1;

	2077 output[13] = 2*(temp1*C8);

	2078

	2079 output[ 9] = 2*((step[10] + step[11])*C8);

	2080

	2081 intermediate[11] = step[10] - step[11];

	2082 intermediate[12] = step[12] + step[13];

	2083 intermediate[13] = step[12] - step[13];

	2084 intermediate[14] = step[ 8] - step[14];

	2085 intermediate[15] = step[ 9] - step[15];

	2086

	2087 output[15] = (intermediate[11] + intermediate[12]);

	2088 output[ 1] = -(intermediate[11] - intermediate[12]);

	2089

	2090 output[ 7] = 2*(intermediate[13]*C8);

	2091

	2092 temp1 = intermediate[14]*C12;

	2093 temp2 = intermediate[15]*C4;

	2094 temp1 = temp1 - temp2;

	2095 output[11] = -2*(temp1*C8);

	2096

	2097 temp1 = intermediate[14]*C4;

	2098 temp2 = intermediate[15]*C12;

	2099 temp1 = temp2 + temp1;

	2100 output[ 5] = 2*(temp1*C8);

	2101 }

	2102 vp9_clear_system_state(); // Make it simd safe : __asm emms;

	2103 }

	2104

	2105 static void vp9_short_fdct16x16_c_f(short *input, short *out, int pitch,

	2106 int scale) {

	2107 vp9_clear_system_state(); // Make it simd safe : __asm emms;

	2108 {

	2109 int shortpitch = pitch >> 1;

	2110 int i, j;

	2111 double output[256];

	2112 // First transform columns

	2113 for (i = 0; i < 16; i++) {

	2114 double temp_in[16], temp_out[16];

	2115 for (j = 0; j < 16; j++)

	2116 temp_in[j] = input[j*shortpitch + i];

	2117 dct16x16_1d_f(temp_in, temp_out);

	2118 for (j = 0; j < 16; j++)

	2119 output[j*16 + i] = temp_out[j];

	2120 }

	2121 // Then transform rows

	2122 for (i = 0; i < 16; ++i) {

	2123 double temp_in[16], temp_out[16];

	2124 for (j = 0; j < 16; ++j)

	2125 temp_in[j] = output[j + i*16];

	2126 dct16x16_1d_f(temp_in, temp_out);

	2127 for (j = 0; j < 16; ++j)

	2128 output[j + i*16] = temp_out[j];

	2129 }

	2130 // Scale by some magic number

	2131 for (i = 0; i < 256; i++)

	2132 out[i] = (short)round(output[i] / (2 << scale));

	2133 }

	2134 vp9_clear_system_state(); // Make it simd safe : __asm emms;

	2135 }

	2136

	2137 void vp9_short_fdct8x8_c_f(short *block, short *coefs, int pitch, int scale) {

	2138 int j1, i, j, k;

	2139 float b[8];

	2140 float b1[8];

	2141 float d[8][8];

	2142 float f0 = (float) .7071068;

	2143 float f1 = (float) .4903926;

	2144 float f2 = (float) .4619398;

	2145 float f3 = (float) .4157348;

	2146 float f4 = (float) .3535534;

	2147 float f5 = (float) .2777851;

	2148 float f6 = (float) .1913417;

	2149 float f7 = (float) .0975452;

	2150 pitch = pitch / 2;

	2151 for (i = 0, k = 0; i < 8; i++, k += pitch) {

	2152 for (j = 0; j < 8; j++) {

	2153 b[j] = (float)(block[k + j] << (3 - scale));

	2154 }

	2155 /* Horizontal transform */

	2156 for (j = 0; j < 4; j++) {

	2157 j1 = 7 - j;

	2158 b1[j] = b[j] + b[j1];

	2159 b1[j1] = b[j] - b[j1];

	2160 }

	2161 b[0] = b1[0] + b1[3];

	2162 b[1] = b1[1] + b1[2];

	2163 b[2] = b1[1] - b1[2];

	2164 b[3] = b1[0] - b1[3];

	2165 b[4] = b1[4];

	2166 b[5] = (b1[6] - b1[5]) * f0;

	2167 b[6] = (b1[6] + b1[5]) * f0;

	2168 b[7] = b1[7];

	2169 d[i][0] = (b[0] + b[1]) * f4;

	2170 d[i][4] = (b[0] - b[1]) * f4;

	2171 d[i][2] = b[2] * f6 + b[3] * f2;

	2172 d[i][6] = b[3] * f6 - b[2] * f2;

	2173 b1[4] = b[4] + b[5];

	2174 b1[7] = b[7] + b[6];

	2175 b1[5] = b[4] - b[5];

	2176 b1[6] = b[7] - b[6];

	2177 d[i][1] = b1[4] * f7 + b1[7] * f1;

	2178 d[i][5] = b1[5] * f3 + b1[6] * f5;

	2179 d[i][7] = b1[7] * f7 - b1[4] * f1;

	2180 d[i][3] = b1[6] * f3 - b1[5] * f5;

	2181 }

	2182 /* Vertical transform */

	2183 for (i = 0; i < 8; i++) {

	2184 for (j = 0; j < 4; j++) {

	2185 j1 = 7 - j;

	2186 b1[j] = d[j][i] + d[j1][i];

	2187 b1[j1] = d[j][i] - d[j1][i];

	2188 }

	2189 b[0] = b1[0] + b1[3];

	2190 b[1] = b1[1] + b1[2];

	2191 b[2] = b1[1] - b1[2];

	2192 b[3] = b1[0] - b1[3];

	2193 b[4] = b1[4];

	2194 b[5] = (b1[6] - b1[5]) * f0;

	2195 b[6] = (b1[6] + b1[5]) * f0;

	2196 b[7] = b1[7];

	2197 d[0][i] = (b[0] + b[1]) * f4;

	2198 d[4][i] = (b[0] - b[1]) * f4;

	2199 d[2][i] = b[2] * f6 + b[3] * f2;

	2200 d[6][i] = b[3] * f6 - b[2] * f2;

	2201 b1[4] = b[4] + b[5];

	2202 b1[7] = b[7] + b[6];

	2203 b1[5] = b[4] - b[5];

	2204 b1[6] = b[7] - b[6];

	2205 d[1][i] = b1[4] * f7 + b1[7] * f1;

	2206 d[5][i] = b1[5] * f3 + b1[6] * f5;

	2207 d[7][i] = b1[7] * f7 - b1[4] * f1;

	2208 d[3][i] = b1[6] * f3 - b1[5] * f5;

	2209 }

	2210 for (i = 0; i < 8; i++) {

	2211 for (j = 0; j < 8; j++) {

	2212 *(coefs + j + i * 8) = (short) floor(d[i][j] + 0.5);

	2213 }

	2214 }

	2215 return;

	2216 }

	2217

	2218 #define divide_bits(d, n) ((n) < 0 ? (d) << (n) : (d) >> (n))

	2219

	2220 #if DWTDCT_TYPE == DWTDCT16X16_LEAN

	2221

	2222 void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {

	2223 // assume out is a 32x32 buffer

	2224 short buffer[16 * 16];

	2225 int i, j;

	2226 const int short_pitch = pitch >> 1;

	2227 #if DWT_TYPE == 26

	2228 dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32);

	2229 #elif DWT_TYPE == 97

	2230 dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32);

	2231 #elif DWT_TYPE == 53

	2232 dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32);

	2233 #endif

	2234 // TODO(debargha): Implement more efficiently by adding output pitch

	2235 // argument to the dct16x16 function

	2236 vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);

	2237 for (i = 0; i < 16; ++i)

	2238 vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);

	2239 for (i = 0; i < 16; ++i) {

	2240 for (j = 16; j < 32; ++j) {

	2241 out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);

	2242 }

	2243 }

	2244 for (i = 16; i < 32; ++i) {

	2245 for (j = 0; j < 32; ++j) {

	2246 out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);

	2247 }

	2248 }

	2249 }

	2250

	2251 #elif DWTDCT_TYPE == DWTDCT16X16

	2252

	2253 void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {

	2254 // assume out is a 32x32 buffer

	2255 short buffer[16 * 16];

	2256 int i, j;

	2257 const int short_pitch = pitch >> 1;

	2258 #if DWT_TYPE == 26

	2259 dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32);

	2260 #elif DWT_TYPE == 97

	2261 dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32);

	2262 #elif DWT_TYPE == 53

	2263 dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32);

	2264 #endif

	2265 // TODO(debargha): Implement more efficiently by adding output pitch

	2266 // argument to the dct16x16 function

	2267 vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);

	2268 for (i = 0; i < 16; ++i)

	2269 vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);

	2270 vp9_short_fdct16x16_c_f(out + 16, buffer, 64, 1 + DWT_PRECISION_BITS);

	2271 for (i = 0; i < 16; ++i)

	2272 vpx_memcpy(out + i * 32 + 16, buffer + i * 16, sizeof(short) * 16);

	2273

	2274 vp9_short_fdct16x16_c_f(out + 32 * 16, buffer, 64, 1 + DWT_PRECISION_BITS);

	2275 for (i = 0; i < 16; ++i)

	2276 vpx_memcpy(out + i * 32 + 32 * 16, buffer + i * 16, sizeof(short) * 16);

	2277

	2278 vp9_short_fdct16x16_c_f(out + 33 * 16, buffer, 64, 1 + DWT_PRECISION_BITS);

	2279 for (i = 0; i < 16; ++i)

	2280 vpx_memcpy(out + i * 32 + 33 * 16, buffer + i * 16, sizeof(short) * 16);

	2281 }

	2282

	2283 #elif DWTDCT_TYPE == DWTDCT8X8

	2284

	2285 void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {

	2286 // assume out is a 32x32 buffer

	2287 short buffer[8 * 8];

	2288 int i, j;

	2289 const int short_pitch = pitch >> 1;

	2290 #if DWT_TYPE == 26

	2291 dyadic_analyze_26(2, 32, 32, input, short_pitch, out, 32);

	2292 #elif DWT_TYPE == 97

	2293 dyadic_analyze_97(2, 32, 32, input, short_pitch, out, 32);

	2294 #elif DWT_TYPE == 53

	2295 dyadic_analyze_53(2, 32, 32, input, short_pitch, out, 32);

	2296 #endif

	2297 // TODO(debargha): Implement more efficiently by adding output pitch

	2298 // argument to the dct16x16 function

	2299 vp9_short_fdct8x8_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);

	2300 for (i = 0; i < 8; ++i)

	2301 vpx_memcpy(out + i * 32, buffer + i * 8, sizeof(short) * 8);

	2302

	2303 vp9_short_fdct8x8_c_f(out + 8, buffer, 64, 1 + DWT_PRECISION_BITS);

	2304 for (i = 0; i < 8; ++i)

	2305 vpx_memcpy(out + i * 32 + 8, buffer + i * 8, sizeof(short) * 8);

	2306

	2307 vp9_short_fdct8x8_c_f(out + 32 * 8, buffer, 64, 1 + DWT_PRECISION_BITS);

	2308 for (i = 0; i < 8; ++i)

	2309 vpx_memcpy(out + i * 32 + 32 * 8, buffer + i * 8, sizeof(short) * 8);

	2310

	2311 vp9_short_fdct8x8_c_f(out + 33 * 8, buffer, 64, 1 + DWT_PRECISION_BITS);

	2312 for (i = 0; i < 8; ++i)

	2313 vpx_memcpy(out + i * 32 + 33 * 8, buffer + i * 8, sizeof(short) * 8);

	2314

	2315 for (i = 0; i < 16; ++i) {

	2316 for (j = 16; j < 32; ++j) {

	2317 out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);

	2318 }

	2319 }

	2320 for (i = 16; i < 32; ++i) {

	2321 for (j = 0; j < 32; ++j) {

	2322 out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);

	2323 }

	2324 }

	2325 }

	2326

	2327 #endif

	2328

	2329 #if CONFIG_TX64X64

	2330 void vp9_short_fdct64x64_c(short *input, short *out, int pitch) {

	2331 // assume out is a 64x64 buffer

	2332 short buffer[16 * 16];

	2333 int i, j;

	2334 const int short_pitch = pitch >> 1;

	2335 #if DWT_TYPE == 26

	2336 dyadic_analyze_26(2, 64, 64, input, short_pitch, out, 64);

	2337 #elif DWT_TYPE == 97

	2338 dyadic_analyze_97(2, 64, 64, input, short_pitch, out, 64);

	2339 #elif DWT_TYPE == 53

	2340 dyadic_analyze_53(2, 64, 64, input, short_pitch, out, 64);

	2341 #endif

	2342 // TODO(debargha): Implement more efficiently by adding output pitch

	2343 // argument to the dct16x16 function

	2344 vp9_short_fdct16x16_c_f(out, buffer, 128, 2 + DWT_PRECISION_BITS);

	2345 for (i = 0; i < 16; ++i)

	2346 vpx_memcpy(out + i * 64, buffer + i * 16, sizeof(short) * 16);

	2347

	2348 #if DWTDCT_TYPE == DWTDCT16X16_LEAN

	2349 for (i = 0; i < 16; ++i) {

	2350 for (j = 16; j < 48; ++j) {

	2351 out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);

	2352 }

	2353 }

	2354 for (i = 16; i < 64; ++i) {

	2355 for (j = 0; j < 64; ++j) {

	2356 out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);

	2357 }

	2358 }

	2359 #elif DWTDCT_TYPE == DWTDCT16X16

	2360 vp9_short_fdct16x16_c_f(out + 16, buffer, 128, 2 + DWT_PRECISION_BITS);

	2361 for (i = 0; i < 16; ++i)

	2362 vpx_memcpy(out + i * 64 + 16, buffer + i * 16, sizeof(short) * 16);

	2363

	2364 vp9_short_fdct16x16_c_f(out + 64 * 16, buffer, 128, 2 + DWT_PRECISION_BITS);

	2365 for (i = 0; i < 16; ++i)

	2366 vpx_memcpy(out + i * 64 + 64 * 16, buffer + i * 16, sizeof(short) * 16);

	2367

	2368 vp9_short_fdct16x16_c_f(out + 65 * 16, buffer, 128, 2 + DWT_PRECISION_BITS);

	2369 for (i = 0; i < 16; ++i)

	2370 vpx_memcpy(out + i * 64 + 65 * 16, buffer + i * 16, sizeof(short) * 16);

	2371

	2372 // There is no dct used on the highest bands for now.

	2373 // Need to scale these coeffs by a factor of 2/2^DWT_PRECISION_BITS

	2374 // TODO(debargha): experiment with turning these coeffs to 0

	2375 for (i = 0; i < 32; ++i) {

	2376 for (j = 32; j < 64; ++j) {

	2377 out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);

	2378 }

	2379 }

	2380 for (i = 32; i < 64; ++i) {

	2381 for (j = 0; j < 64; ++j) {

	2382 out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);

	2383 }

	2384 }

	2385 #endif // DWTDCT_TYPE

	2386 }

	2387 #endif // CONFIG_TX64X64

	2388 #endif // CONFIG_DWTDCTHYBRID

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/encoder/vp9_boolhuff.h ('k') | source/libvpx/vp9/encoder/vp9_encodeframe.h » ('j') | no next file with comments »