Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(227)

Side by Side Diff: source/libvpx/vp9/encoder/vp9_dct.c

Issue 11974002: libvpx: Pull from upstream (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 7 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « source/libvpx/vp9/encoder/vp9_boolhuff.h ('k') | source/libvpx/vp9/encoder/vp9_encodeframe.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 11
12 #include <assert.h> 12 #include <assert.h>
13 #include <math.h> 13 #include <math.h>
14 #include "vpx_ports/config.h" 14 #include "./vpx_config.h"
15 #include "vp9/common/vp9_systemdependent.h" 15 #include "vp9/common/vp9_systemdependent.h"
16 16
17 #include "vp9/common/vp9_blockd.h" 17 #include "vp9/common/vp9_blockd.h"
18 18
19 // TODO: these transforms can be converted into integer forms to reduce 19 // TODO: these transforms can be converted into integer forms to reduce
20 // the complexity 20 // the complexity
21 static const float dct_4[16] = { 21 static const float dct_4[16] = {
22 0.500000000000000, 0.500000000000000, 0.500000000000000, 0.500000000000000, 22 0.500000000000000, 0.500000000000000, 0.500000000000000, 0.500000000000000,
23 0.653281482438188, 0.270598050073099, -0.270598050073099, -0.653281482438188, 23 0.653281482438188, 0.270598050073099, -0.270598050073099, -0.653281482438188,
24 0.500000000000000, -0.500000000000000, -0.500000000000000, 0.500000000000000, 24 0.500000000000000, -0.500000000000000, -0.500000000000000, 0.500000000000000,
(...skipping 870 matching lines...) Expand 10 before | Expand all | Expand 10 after
895 } 895 }
896 896
// Runs vp9_short_walsh4x4_x8_c twice with the same pitch: once on
// input/output as passed in, and once with input advanced by 4 elements and
// output advanced by 16 elements.
897 void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) { 897 void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) {
898 vp9_short_walsh4x4_x8_c(input, output, pitch); 898 vp9_short_walsh4x4_x8_c(input, output, pitch);
899 vp9_short_walsh4x4_x8_c(input + 4, output + 16, pitch); 899 vp9_short_walsh4x4_x8_c(input + 4, output + 16, pitch);
900 } 900 }
901 #endif 901 #endif
902 902
903 #define TEST_INT_16x16_DCT 1 903 #define TEST_INT_16x16_DCT 1
904 #if !TEST_INT_16x16_DCT 904 #if !TEST_INT_16x16_DCT
905 static const double C1 = 0.995184726672197;
906 static const double C2 = 0.98078528040323;
907 static const double C3 = 0.956940335732209;
908 static const double C4 = 0.923879532511287;
909 static const double C5 = 0.881921264348355;
910 static const double C6 = 0.831469612302545;
911 static const double C7 = 0.773010453362737;
912 static const double C8 = 0.707106781186548;
913 static const double C9 = 0.634393284163646;
914 static const double C10 = 0.555570233019602;
915 static const double C11 = 0.471396736825998;
916 static const double C12 = 0.38268343236509;
917 static const double C13 = 0.290284677254462;
918 static const double C14 = 0.195090322016128;
919 static const double C15 = 0.098017140329561;
920 905
921 static void dct16x16_1d(double input[16], double output[16]) { 906 static void dct16x16_1d(double input[16], double output[16]) {
907 static const double C1 = 0.995184726672197;
908 static const double C2 = 0.98078528040323;
909 static const double C3 = 0.956940335732209;
910 static const double C4 = 0.923879532511287;
911 static const double C5 = 0.881921264348355;
912 static const double C6 = 0.831469612302545;
913 static const double C7 = 0.773010453362737;
914 static const double C8 = 0.707106781186548;
915 static const double C9 = 0.634393284163646;
916 static const double C10 = 0.555570233019602;
917 static const double C11 = 0.471396736825998;
918 static const double C12 = 0.38268343236509;
919 static const double C13 = 0.290284677254462;
920 static const double C14 = 0.195090322016128;
921 static const double C15 = 0.098017140329561;
922
922 vp9_clear_system_state(); // Make it simd safe : __asm emms; 923 vp9_clear_system_state(); // Make it simd safe : __asm emms;
923 { 924 {
924 double step[16]; 925 double step[16];
925 double intermediate[16]; 926 double intermediate[16];
926 double temp1, temp2; 927 double temp1, temp2;
927 928
928 // step 1 929 // step 1
929 step[ 0] = input[0] + input[15]; 930 step[ 0] = input[0] + input[15];
930 step[ 1] = input[1] + input[14]; 931 step[ 1] = input[1] + input[14];
931 step[ 2] = input[2] + input[13]; 932 step[ 2] = input[2] + input[13];
(...skipping 391 matching lines...) Expand 10 before | Expand all | Expand 10 after
1323 // Then transform rows 1324 // Then transform rows
1324 for (i = 0; i < 16; ++i) { 1325 for (i = 0; i < 16; ++i) {
1325 dct16x16_1d(outptr, out, 1); 1326 dct16x16_1d(outptr, out, 1);
1326 outptr += 16; 1327 outptr += 16;
1327 out += 16; 1328 out += 16;
1328 } 1329 }
1329 } 1330 }
1330 #undef RIGHT_SHIFT 1331 #undef RIGHT_SHIFT
1331 #undef ROUNDING 1332 #undef ROUNDING
1332 #endif 1333 #endif
1334
1335 #if !CONFIG_DWTDCTHYBRID
// One-dimensional 32-point forward transform in double precision, written as
// a fixed sequence of butterfly stages.
//
//   input   32 values, read at input[stride * k] for k = 0..31.
//   output  32 results, written at output[stride * k] for k = 0..31.
//   stride  element spacing applied to both input and output.
//
// The constants Ck are cos(pi * k / 64). input is only read during Stage 1;
// after that the stages alternate between the local step[] array and output[],
// so output[] is overwritten several times before it holds the final values.
// No scaling is applied in this function.
1336 static void dct32_1d(double *input, double *output, int stride) {
1337 static const double C1 = 0.998795456205; // cos(pi * 1 / 64)
1338 static const double C2 = 0.995184726672; // cos(pi * 2 / 64)
1339 static const double C3 = 0.989176509965; // cos(pi * 3 / 64)
1340 static const double C4 = 0.980785280403; // cos(pi * 4 / 64)
1341 static const double C5 = 0.970031253195; // cos(pi * 5 / 64)
1342 static const double C6 = 0.956940335732; // cos(pi * 6 / 64)
1343 static const double C7 = 0.941544065183; // cos(pi * 7 / 64)
1344 static const double C8 = 0.923879532511; // cos(pi * 8 / 64)
1345 static const double C9 = 0.903989293123; // cos(pi * 9 / 64)
1346 static const double C10 = 0.881921264348; // cos(pi * 10 / 64)
1347 static const double C11 = 0.857728610000; // cos(pi * 11 / 64)
1348 static const double C12 = 0.831469612303; // cos(pi * 12 / 64)
1349 static const double C13 = 0.803207531481; // cos(pi * 13 / 64)
1350 static const double C14 = 0.773010453363; // cos(pi * 14 / 64)
1351 static const double C15 = 0.740951125355; // cos(pi * 15 / 64)
1352 static const double C16 = 0.707106781187; // cos(pi * 16 / 64)
1353 static const double C17 = 0.671558954847; // cos(pi * 17 / 64)
1354 static const double C18 = 0.634393284164; // cos(pi * 18 / 64)
1355 static const double C19 = 0.595699304492; // cos(pi * 19 / 64)
1356 static const double C20 = 0.555570233020; // cos(pi * 20 / 64)
1357 static const double C21 = 0.514102744193; // cos(pi * 21 / 64)
1358 static const double C22 = 0.471396736826; // cos(pi * 22 / 64)
1359 static const double C23 = 0.427555093430; // cos(pi * 23 / 64)
1360 static const double C24 = 0.382683432365; // cos(pi * 24 / 64)
1361 static const double C25 = 0.336889853392; // cos(pi * 25 / 64)
1362 static const double C26 = 0.290284677254; // cos(pi * 26 / 64)
1363 static const double C27 = 0.242980179903; // cos(pi * 27 / 64)
1364 static const double C28 = 0.195090322016; // cos(pi * 28 / 64)
1365 static const double C29 = 0.146730474455; // cos(pi * 29 / 64)
1366 static const double C30 = 0.098017140330; // cos(pi * 30 / 64)
1367 static const double C31 = 0.049067674327; // cos(pi * 31 / 64)
1368
1369 double step[32];
1370
// Sums of mirrored input pairs go to step[0..15], differences to step[16..31].
// This is the only stage that reads input[].
1371 // Stage 1
1372 step[0] = input[stride*0] + input[stride*(32 - 1)];
1373 step[1] = input[stride*1] + input[stride*(32 - 2)];
1374 step[2] = input[stride*2] + input[stride*(32 - 3)];
1375 step[3] = input[stride*3] + input[stride*(32 - 4)];
1376 step[4] = input[stride*4] + input[stride*(32 - 5)];
1377 step[5] = input[stride*5] + input[stride*(32 - 6)];
1378 step[6] = input[stride*6] + input[stride*(32 - 7)];
1379 step[7] = input[stride*7] + input[stride*(32 - 8)];
1380 step[8] = input[stride*8] + input[stride*(32 - 9)];
1381 step[9] = input[stride*9] + input[stride*(32 - 10)];
1382 step[10] = input[stride*10] + input[stride*(32 - 11)];
1383 step[11] = input[stride*11] + input[stride*(32 - 12)];
1384 step[12] = input[stride*12] + input[stride*(32 - 13)];
1385 step[13] = input[stride*13] + input[stride*(32 - 14)];
1386 step[14] = input[stride*14] + input[stride*(32 - 15)];
1387 step[15] = input[stride*15] + input[stride*(32 - 16)];
1388 step[16] = -input[stride*16] + input[stride*(32 - 17)];
1389 step[17] = -input[stride*17] + input[stride*(32 - 18)];
1390 step[18] = -input[stride*18] + input[stride*(32 - 19)];
1391 step[19] = -input[stride*19] + input[stride*(32 - 20)];
1392 step[20] = -input[stride*20] + input[stride*(32 - 21)];
1393 step[21] = -input[stride*21] + input[stride*(32 - 22)];
1394 step[22] = -input[stride*22] + input[stride*(32 - 23)];
1395 step[23] = -input[stride*23] + input[stride*(32 - 24)];
1396 step[24] = -input[stride*24] + input[stride*(32 - 25)];
1397 step[25] = -input[stride*25] + input[stride*(32 - 26)];
1398 step[26] = -input[stride*26] + input[stride*(32 - 27)];
1399 step[27] = -input[stride*27] + input[stride*(32 - 28)];
1400 step[28] = -input[stride*28] + input[stride*(32 - 29)];
1401 step[29] = -input[stride*29] + input[stride*(32 - 30)];
1402 step[30] = -input[stride*30] + input[stride*(32 - 31)];
1403 step[31] = -input[stride*31] + input[stride*(32 - 32)];
1404
// From here on output[] serves as scratch storage between stages.
1405 // Stage 2
1406 output[stride*0] = step[0] + step[16 - 1];
1407 output[stride*1] = step[1] + step[16 - 2];
1408 output[stride*2] = step[2] + step[16 - 3];
1409 output[stride*3] = step[3] + step[16 - 4];
1410 output[stride*4] = step[4] + step[16 - 5];
1411 output[stride*5] = step[5] + step[16 - 6];
1412 output[stride*6] = step[6] + step[16 - 7];
1413 output[stride*7] = step[7] + step[16 - 8];
1414 output[stride*8] = -step[8] + step[16 - 9];
1415 output[stride*9] = -step[9] + step[16 - 10];
1416 output[stride*10] = -step[10] + step[16 - 11];
1417 output[stride*11] = -step[11] + step[16 - 12];
1418 output[stride*12] = -step[12] + step[16 - 13];
1419 output[stride*13] = -step[13] + step[16 - 14];
1420 output[stride*14] = -step[14] + step[16 - 15];
1421 output[stride*15] = -step[15] + step[16 - 16];
1422
1423 output[stride*16] = step[16];
1424 output[stride*17] = step[17];
1425 output[stride*18] = step[18];
1426 output[stride*19] = step[19];
1427
1428 output[stride*20] = (-step[20] + step[27])*C16;
1429 output[stride*21] = (-step[21] + step[26])*C16;
1430 output[stride*22] = (-step[22] + step[25])*C16;
1431 output[stride*23] = (-step[23] + step[24])*C16;
1432
1433 output[stride*24] = (step[24] + step[23])*C16;
1434 output[stride*25] = (step[25] + step[22])*C16;
1435 output[stride*26] = (step[26] + step[21])*C16;
1436 output[stride*27] = (step[27] + step[20])*C16;
1437
1438 output[stride*28] = step[28];
1439 output[stride*29] = step[29];
1440 output[stride*30] = step[30];
1441 output[stride*31] = step[31];
1442
1443 // Stage 3
1444 step[0] = output[stride*0] + output[stride*(8 - 1)];
1445 step[1] = output[stride*1] + output[stride*(8 - 2)];
1446 step[2] = output[stride*2] + output[stride*(8 - 3)];
1447 step[3] = output[stride*3] + output[stride*(8 - 4)];
1448 step[4] = -output[stride*4] + output[stride*(8 - 5)];
1449 step[5] = -output[stride*5] + output[stride*(8 - 6)];
1450 step[6] = -output[stride*6] + output[stride*(8 - 7)];
1451 step[7] = -output[stride*7] + output[stride*(8 - 8)];
1452 step[8] = output[stride*8];
1453 step[9] = output[stride*9];
1454 step[10] = (-output[stride*10] + output[stride*13])*C16;
1455 step[11] = (-output[stride*11] + output[stride*12])*C16;
1456 step[12] = (output[stride*12] + output[stride*11])*C16;
1457 step[13] = (output[stride*13] + output[stride*10])*C16;
1458 step[14] = output[stride*14];
1459 step[15] = output[stride*15];
1460
1461 step[16] = output[stride*16] + output[stride*23];
1462 step[17] = output[stride*17] + output[stride*22];
1463 step[18] = output[stride*18] + output[stride*21];
1464 step[19] = output[stride*19] + output[stride*20];
1465 step[20] = -output[stride*20] + output[stride*19];
1466 step[21] = -output[stride*21] + output[stride*18];
1467 step[22] = -output[stride*22] + output[stride*17];
1468 step[23] = -output[stride*23] + output[stride*16];
1469 step[24] = -output[stride*24] + output[stride*31];
1470 step[25] = -output[stride*25] + output[stride*30];
1471 step[26] = -output[stride*26] + output[stride*29];
1472 step[27] = -output[stride*27] + output[stride*28];
1473 step[28] = output[stride*28] + output[stride*27];
1474 step[29] = output[stride*29] + output[stride*26];
1475 step[30] = output[stride*30] + output[stride*25];
1476 step[31] = output[stride*31] + output[stride*24];
1477
1478 // Stage 4
1479 output[stride*0] = step[0] + step[3];
1480 output[stride*1] = step[1] + step[2];
1481 output[stride*2] = -step[2] + step[1];
1482 output[stride*3] = -step[3] + step[0];
1483 output[stride*4] = step[4];
1484 output[stride*5] = (-step[5] + step[6])*C16;
1485 output[stride*6] = (step[6] + step[5])*C16;
1486 output[stride*7] = step[7];
1487 output[stride*8] = step[8] + step[11];
1488 output[stride*9] = step[9] + step[10];
1489 output[stride*10] = -step[10] + step[9];
1490 output[stride*11] = -step[11] + step[8];
1491 output[stride*12] = -step[12] + step[15];
1492 output[stride*13] = -step[13] + step[14];
1493 output[stride*14] = step[14] + step[13];
1494 output[stride*15] = step[15] + step[12];
1495
1496 output[stride*16] = step[16];
1497 output[stride*17] = step[17];
1498 output[stride*18] = step[18]*-C8 + step[29]*C24;
1499 output[stride*19] = step[19]*-C8 + step[28]*C24;
1500 output[stride*20] = step[20]*-C24 + step[27]*-C8;
1501 output[stride*21] = step[21]*-C24 + step[26]*-C8;
1502 output[stride*22] = step[22];
1503 output[stride*23] = step[23];
1504 output[stride*24] = step[24];
1505 output[stride*25] = step[25];
1506 output[stride*26] = step[26]*C24 + step[21]*-C8;
1507 output[stride*27] = step[27]*C24 + step[20]*-C8;
1508 output[stride*28] = step[28]*C8 + step[19]*C24;
1509 output[stride*29] = step[29]*C8 + step[18]*C24;
1510 output[stride*30] = step[30];
1511 output[stride*31] = step[31];
1512
1513 // Stage 5
1514 step[0] = (output[stride*0] + output[stride*1]) * C16;
1515 step[1] = (-output[stride*1] + output[stride*0]) * C16;
1516 step[2] = output[stride*2]*C24 + output[stride*3] * C8;
1517 step[3] = output[stride*3]*C24 - output[stride*2] * C8;
1518 step[4] = output[stride*4] + output[stride*5];
1519 step[5] = -output[stride*5] + output[stride*4];
1520 step[6] = -output[stride*6] + output[stride*7];
1521 step[7] = output[stride*7] + output[stride*6];
1522 step[8] = output[stride*8];
1523 step[9] = output[stride*9]*-C8 + output[stride*14]*C24;
1524 step[10] = output[stride*10]*-C24 + output[stride*13]*-C8;
1525 step[11] = output[stride*11];
1526 step[12] = output[stride*12];
1527 step[13] = output[stride*13]*C24 + output[stride*10]*-C8;
1528 step[14] = output[stride*14]*C8 + output[stride*9]*C24;
1529 step[15] = output[stride*15];
1530
1531 step[16] = output[stride*16] + output[stride*19];
1532 step[17] = output[stride*17] + output[stride*18];
1533 step[18] = -output[stride*18] + output[stride*17];
1534 step[19] = -output[stride*19] + output[stride*16];
1535 step[20] = -output[stride*20] + output[stride*23];
1536 step[21] = -output[stride*21] + output[stride*22];
1537 step[22] = output[stride*22] + output[stride*21];
1538 step[23] = output[stride*23] + output[stride*20];
1539 step[24] = output[stride*24] + output[stride*27];
1540 step[25] = output[stride*25] + output[stride*26];
1541 step[26] = -output[stride*26] + output[stride*25];
1542 step[27] = -output[stride*27] + output[stride*24];
1543 step[28] = -output[stride*28] + output[stride*31];
1544 step[29] = -output[stride*29] + output[stride*30];
1545 step[30] = output[stride*30] + output[stride*29];
1546 step[31] = output[stride*31] + output[stride*28];
1547
1548 // Stage 6
1549 output[stride*0] = step[0];
1550 output[stride*1] = step[1];
1551 output[stride*2] = step[2];
1552 output[stride*3] = step[3];
1553 output[stride*4] = step[4]*C28 + step[7]*C4;
1554 output[stride*5] = step[5]*C12 + step[6]*C20;
1555 output[stride*6] = step[6]*C12 + step[5]*-C20;
1556 output[stride*7] = step[7]*C28 + step[4]*-C4;
1557 output[stride*8] = step[8] + step[9];
1558 output[stride*9] = -step[9] + step[8];
1559 output[stride*10] = -step[10] + step[11];
1560 output[stride*11] = step[11] + step[10];
1561 output[stride*12] = step[12] + step[13];
1562 output[stride*13] = -step[13] + step[12];
1563 output[stride*14] = -step[14] + step[15];
1564 output[stride*15] = step[15] + step[14];
1565
1566 output[stride*16] = step[16];
1567 output[stride*17] = step[17]*-C4 + step[30]*C28;
1568 output[stride*18] = step[18]*-C28 + step[29]*-C4;
1569 output[stride*19] = step[19];
1570 output[stride*20] = step[20];
1571 output[stride*21] = step[21]*-C20 + step[26]*C12;
1572 output[stride*22] = step[22]*-C12 + step[25]*-C20;
1573 output[stride*23] = step[23];
1574 output[stride*24] = step[24];
1575 output[stride*25] = step[25]*C12 + step[22]*-C20;
1576 output[stride*26] = step[26]*C20 + step[21]*C12;
1577 output[stride*27] = step[27];
1578 output[stride*28] = step[28];
1579 output[stride*29] = step[29]*C28 + step[18]*-C4;
1580 output[stride*30] = step[30]*C4 + step[17]*C28;
1581 output[stride*31] = step[31];
1582
1583 // Stage 7
1584 step[0] = output[stride*0];
1585 step[1] = output[stride*1];
1586 step[2] = output[stride*2];
1587 step[3] = output[stride*3];
1588 step[4] = output[stride*4];
1589 step[5] = output[stride*5];
1590 step[6] = output[stride*6];
1591 step[7] = output[stride*7];
1592 step[8] = output[stride*8]*C30 + output[stride*15]*C2;
1593 step[9] = output[stride*9]*C14 + output[stride*14]*C18;
1594 step[10] = output[stride*10]*C22 + output[stride*13]*C10;
1595 step[11] = output[stride*11]*C6 + output[stride*12]*C26;
1596 step[12] = output[stride*12]*C6 + output[stride*11]*-C26;
1597 step[13] = output[stride*13]*C22 + output[stride*10]*-C10;
1598 step[14] = output[stride*14]*C14 + output[stride*9]*-C18;
1599 step[15] = output[stride*15]*C30 + output[stride*8]*-C2;
1600
1601 step[16] = output[stride*16] + output[stride*17];
1602 step[17] = -output[stride*17] + output[stride*16];
1603 step[18] = -output[stride*18] + output[stride*19];
1604 step[19] = output[stride*19] + output[stride*18];
1605 step[20] = output[stride*20] + output[stride*21];
1606 step[21] = -output[stride*21] + output[stride*20];
1607 step[22] = -output[stride*22] + output[stride*23];
1608 step[23] = output[stride*23] + output[stride*22];
1609 step[24] = output[stride*24] + output[stride*25];
1610 step[25] = -output[stride*25] + output[stride*24];
1611 step[26] = -output[stride*26] + output[stride*27];
1612 step[27] = output[stride*27] + output[stride*26];
1613 step[28] = output[stride*28] + output[stride*29];
1614 step[29] = -output[stride*29] + output[stride*28];
1615 step[30] = -output[stride*30] + output[stride*31];
1616 step[31] = output[stride*31] + output[stride*30];
1617
// step[0..15] are copied out unchanged to permuted output slots; step[16..31]
// get one more rotation by the odd-indexed constants before being stored.
1618 // Final stage --- outputs indices are bit-reversed.
1619 output[stride*0] = step[0];
1620 output[stride*16] = step[1];
1621 output[stride*8] = step[2];
1622 output[stride*24] = step[3];
1623 output[stride*4] = step[4];
1624 output[stride*20] = step[5];
1625 output[stride*12] = step[6];
1626 output[stride*28] = step[7];
1627 output[stride*2] = step[8];
1628 output[stride*18] = step[9];
1629 output[stride*10] = step[10];
1630 output[stride*26] = step[11];
1631 output[stride*6] = step[12];
1632 output[stride*22] = step[13];
1633 output[stride*14] = step[14];
1634 output[stride*30] = step[15];
1635
1636 output[stride*1] = step[16]*C31 + step[31]*C1;
1637 output[stride*17] = step[17]*C15 + step[30]*C17;
1638 output[stride*9] = step[18]*C23 + step[29]*C9;
1639 output[stride*25] = step[19]*C7 + step[28]*C25;
1640 output[stride*5] = step[20]*C27 + step[27]*C5;
1641 output[stride*21] = step[21]*C11 + step[26]*C21;
1642 output[stride*13] = step[22]*C19 + step[25]*C13;
1643 output[stride*29] = step[23]*C3 + step[24]*C29;
1644 output[stride*3] = step[24]*C3 + step[23]*-C29;
1645 output[stride*19] = step[25]*C19 + step[22]*-C13;
1646 output[stride*11] = step[26]*C11 + step[21]*-C21;
1647 output[stride*27] = step[27]*C27 + step[20]*-C5;
1648 output[stride*7] = step[28]*C7 + step[19]*-C25;
1649 output[stride*23] = step[29]*C23 + step[18]*-C9;
1650 output[stride*15] = step[30]*C15 + step[17]*-C17;
1651 output[stride*31] = step[31]*C31 + step[16]*-C1;
1652 }
1653
// Two-dimensional 32x32 forward transform computed in double precision.
//
//   input  source samples; element (row j, column i) is read from
//          input[j * (pitch >> 1) + i].
//   out    receives 1024 values, stored contiguously with 32 values per row.
//   pitch  row pitch of input; it is halved to get the int16_t element stride
//          (presumably because pitch is given in bytes -- confirm with callers).
//
// dct32_1d is applied to every column and then to every row of the
// intermediate 32x32 buffer. Each result is divided by 4, rounded with
// round() and cast to short. vp9_clear_system_state() is called before and
// after the floating-point work.
1654 void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
1655 vp9_clear_system_state(); // Make it simd safe : __asm emms;
1656 {
1657 int shortpitch = pitch >> 1;
1658 int i, j;
1659 double output[1024];
1660 // First transform columns
1661 for (i = 0; i < 32; i++) {
1662 double temp_in[32], temp_out[32];
// Gather column i into a contiguous array so dct32_1d can run with stride 1.
1663 for (j = 0; j < 32; j++)
1664 temp_in[j] = input[j*shortpitch + i];
1665 dct32_1d(temp_in, temp_out, 1);
1666 for (j = 0; j < 32; j++)
1667 output[j*32 + i] = temp_out[j];
1668 }
1669 // Then transform rows
1670 for (i = 0; i < 32; ++i) {
1671 double temp_in[32], temp_out[32];
1672 for (j = 0; j < 32; ++j)
1673 temp_in[j] = output[j + i*32];
1674 dct32_1d(temp_in, temp_out, 1);
1675 for (j = 0; j < 32; ++j)
1676 output[j + i*32] = temp_out[j];
1677 }
1678 // Scale by some magic number
1679 for (i = 0; i < 1024; i++) {
1680 out[i] = (short)round(output[i]/4);
1681 }
1682 }
1683
1684 vp9_clear_system_state(); // Make it simd safe : __asm emms;
1685 }
1686
1687 #else // CONFIG_DWTDCTHYBRID
1688
1689 #if DWT_TYPE == 53
1690
1691 // Note: block length must be even for this implementation
// One analysis pass over a row of `length` samples, splitting it into
// length / 2 lowpass and length / 2 highpass values.
//
//   x         input samples (read only, walked sequentially).
//   lowpass   receives length / 2 values.
//   highpass  receives length / 2 values.
//
// First pass: each even sample is doubled into lowpass, and each odd sample
// has the rounded mean of its two even neighbours subtracted to form highpass.
// The last odd sample has only a left neighbour, so that neighbour is
// subtracted on its own. Second pass: every lowpass value gets the rounded
// mean of the previous and current highpass values added; for the first
// element the "previous" value is highpass[0] itself.
//
// NOTE(review): with length < 2 the `while (--n)` loop starts from a
// non-positive counter and will not stop where intended; the caller visible in
// this file returns before calling when a dimension is below 2. An odd length
// leaves the final sample unused -- confirm callers only pass even lengths.
1692 static void analysis_53_row(int length, short *x,
1693 short *lowpass, short *highpass) {
1694 int n;
1695 short r, *a, *b;
1696
1697 n = length >> 1;
1698 b = highpass;
1699 a = lowpass;
1700 while (--n) {
1701 *a++ = (r = *x++) << 1;
1702 *b++ = *x - ((r + x[1] + 1) >> 1);
1703 x++;
1704 }
// Last pair: there is no right-hand even neighbour to average with.
1705 *a = (r = *x++) << 1;
1706 *b = *x - r;
1707
1708 n = length >> 1;
1709 b = highpass;
1710 a = lowpass;
1711 r = *highpass;
1712 while (n--) {
1713 *a++ += (r + (*b) + 1) >> 1;
1714 r = *b++;
1715 }
1716 }
1717
// Column counterpart of analysis_53_row: same two-pass structure, different
// scaling in the first pass.
//
//   x         input samples (read only, walked sequentially).
//   lowpass   receives length / 2 values.
//   highpass  receives length / 2 values.
//
// First pass: even samples are copied to lowpass unchanged; each highpass
// value is (2 * odd - (left + right) + 2) >> 2, and the last one, which has no
// right neighbour, is (odd - left + 1) >> 1. Second pass: every lowpass value
// gets the rounded mean of the previous and current highpass values added,
// with highpass[0] standing in as the "previous" value for the first element.
//
// NOTE(review): as in analysis_53_row, `while (--n)` misbehaves for
// length < 2, and an odd length leaves the final sample unused.
1718 static void analysis_53_col(int length, short *x,
1719 short *lowpass, short *highpass) {
1720 int n;
1721 short r, *a, *b;
1722
1723 n = length >> 1;
1724 b = highpass;
1725 a = lowpass;
1726 while (--n) {
1727 *a++ = (r = *x++);
1728 *b++ = (((*x) << 1) - (r + x[1]) + 2) >> 2;
1729 x++;
1730 }
// Last pair: there is no right-hand even neighbour.
1731 *a = (r = *x++);
1732 *b = (*x - r + 1) >> 1;
1733
1734 n = length >> 1;
1735 b = highpass;
1736 a = lowpass;
1737 r = *highpass;
1738 while (n--) {
1739 *a++ += (r + (*b) + 1) >> 1;
1740 r = *b++;
1741 }
1742 }
1743
// Multi-level 2-D analysis built on analysis_53_row / analysis_53_col.
//
//   levels         maximum number of decomposition levels to run.
//   width, height  size of the block in x.
//   x, pitch_x     input samples and their row stride in elements.
//   c, pitch_c     output coefficients and their row stride in elements;
//                  also used as the working buffer between levels.
//
// The input is first copied into c, shifted left by DWT_PRECISION_BITS. Each
// level then transforms every row of the current nw x nh region in place
// (lowpass first, highpass starting at offset hw) followed by every column,
// and the region is halved, rounding up, for the next level. The function
// returns early as soon as either dimension of the current region is below 2.
//
// NOTE(review): the left shift is applied to possibly negative samples;
// left-shifting a negative value is undefined in ISO C.
// NOTE(review): buffer holds 2 * DWT_MAX_LENGTH shorts, so this presumably
// requires width and height <= DWT_MAX_LENGTH -- the macro is defined outside
// this view; confirm.
1744 static void dyadic_analyze_53(int levels, int width, int height,
1745 short *x, int pitch_x, short *c, int pitch_c) {
1746 int lv, i, j, nh, nw, hh = height, hw = width;
1747 short buffer[2 * DWT_MAX_LENGTH];
1748 for (i = 0; i < height; i++) {
1749 for (j = 0; j < width; j++) {
1750 c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS;
1751 }
1752 }
1753 for (lv = 0; lv < levels; lv++) {
// nh/nw: size of the region processed at this level; hh/hw: size of its
// lowpass part, which becomes the region for the next level.
1754 nh = hh;
1755 hh = (hh + 1) >> 1;
1756 nw = hw;
1757 hw = (hw + 1) >> 1;
1758 if ((nh < 2) || (nw < 2)) return;
1759 for (i = 0; i < nh; i++) {
1760 memcpy(buffer, &c[i * pitch_c], nw * sizeof(short));
1761 analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
1762 }
1763 for (j = 0; j < nw; j++) {
// The column is staged in the upper part of buffer and the results are
// written to the lower part, then copied back into c.
1764 for (i = 0; i < nh; i++)
1765 buffer[i + nh] = c[i * pitch_c + j];
1766 analysis_53_col(nh, buffer + nh, buffer, buffer + hh);
1767 for (i = 0; i < nh; i++)
1768 c[i * pitch_c + j] = buffer[i];
1769 }
1770 }
1771 }
1772
1773 #elif DWT_TYPE == 26
1774
// One analysis pass over a row of `length` samples, producing length / 2
// lowpass and length / 2 highpass values.
//
//   x         input samples (read only, consumed two at a time).
//   lowpass   receives the sum of each sample pair.
//   highpass  receives the difference of each sample pair, then a correction.
//
// When there are at least 4 pairs, each highpass value is adjusted by
// subtracting (prev_low - next_low + 4) >> 3, where prev_low / next_low are
// the neighbouring lowpass values. At the first element the current lowpass
// value stands in for the missing previous one, and at the last element it
// stands in for the missing next one.
1775 static void analysis_26_row(int length, short *x,
1776 short *lowpass, short *highpass) {
1777 int i, n;
1778 short r, s, *a, *b;
1779 a = lowpass;
1780 b = highpass;
1781 for (i = length >> 1; i; i--) {
1782 r = *x++;
1783 s = *x++;
1784 *a++ = r + s;
1785 *b++ = r - s;
1786 }
1787 n = length >> 1;
1788 if (n >= 4) {
1789 a = lowpass;
1790 b = highpass;
1791 r = *lowpass;
1792 while (--n) {
1793 *b++ -= (r - a[1] + 4) >> 3;
1794 r = *a++;
1795 }
// Last element: no following lowpass value, so the current one is used.
1796 *b -= (r - *a + 4) >> 3;
1797 }
1798 }
1799
// Column counterpart of analysis_26_row. The structure is identical, but the
// pair sum and pair difference are each halved with rounding:
// lowpass = (r + s + 1) >> 1 and highpass = (r - s + 1) >> 1.
//
//   x         input samples (read only, consumed two at a time).
//   lowpass   receives length / 2 values.
//   highpass  receives length / 2 values, then a correction.
//
// When there are at least 4 pairs, each highpass value is adjusted by
// subtracting (prev_low - next_low + 4) >> 3, with the current lowpass value
// standing in for the missing neighbour at both ends.
1800 static void analysis_26_col(int length, short *x,
1801 short *lowpass, short *highpass) {
1802 int i, n;
1803 short r, s, *a, *b;
1804 a = lowpass;
1805 b = highpass;
1806 for (i = length >> 1; i; i--) {
1807 r = *x++;
1808 s = *x++;
1809 *a++ = (r + s + 1) >> 1;
1810 *b++ = (r - s + 1) >> 1;
1811 }
1812 n = length >> 1;
1813 if (n >= 4) {
1814 a = lowpass;
1815 b = highpass;
1816 r = *lowpass;
1817 while (--n) {
1818 *b++ -= (r - a[1] + 4) >> 3;
1819 r = *a++;
1820 }
// Last element: no following lowpass value, so the current one is used.
1821 *b -= (r - *a + 4) >> 3;
1822 }
1823 }
1824
// Multi-level 2-D analysis built on analysis_26_row / analysis_26_col.
//
//   levels         maximum number of decomposition levels to run.
//   width, height  size of the block in x.
//   x, pitch_x     input samples and their row stride in elements.
//   c, pitch_c     output coefficients and their row stride in elements;
//                  also used as the working buffer between levels.
//
// The input is first copied into c, shifted left by DWT_PRECISION_BITS. Each
// level then transforms every row of the current nw x nh region in place
// (lowpass first, highpass starting at offset hw) followed by every column,
// and the region is halved, rounding up, for the next level. The function
// returns early as soon as either dimension of the current region is below 2.
//
// NOTE(review): the left shift is applied to possibly negative samples;
// left-shifting a negative value is undefined in ISO C.
// NOTE(review): buffer holds 2 * DWT_MAX_LENGTH shorts, so this presumably
// requires width and height <= DWT_MAX_LENGTH -- the macro is defined outside
// this view; confirm.
1825 static void dyadic_analyze_26(int levels, int width, int height,
1826 short *x, int pitch_x, short *c, int pitch_c) {
1827 int lv, i, j, nh, nw, hh = height, hw = width;
1828 short buffer[2 * DWT_MAX_LENGTH];
1829 for (i = 0; i < height; i++) {
1830 for (j = 0; j < width; j++) {
1831 c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS;
1832 }
1833 }
1834 for (lv = 0; lv < levels; lv++) {
// nh/nw: size of the region processed at this level; hh/hw: size of its
// lowpass part, which becomes the region for the next level.
1835 nh = hh;
1836 hh = (hh + 1) >> 1;
1837 nw = hw;
1838 hw = (hw + 1) >> 1;
1839 if ((nh < 2) || (nw < 2)) return;
1840 for (i = 0; i < nh; i++) {
1841 memcpy(buffer, &c[i * pitch_c], nw * sizeof(short));
1842 analysis_26_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
1843 }
1844 for (j = 0; j < nw; j++) {
// The column is staged in the upper part of buffer and the results are
// written to the lower part, then copied back into c.
1845 for (i = 0; i < nh; i++)
1846 buffer[i + nh] = c[i * pitch_c + j];
1847 analysis_26_col(nh, buffer + nh, buffer, buffer + hh);
1848 for (i = 0; i < nh; i++)
1849 c[i * pitch_c + j] = buffer[i];
1850 }
1851 }
1852 }
1853
1854 #elif DWT_TYPE == 97
1855
// One floating-point analysis pass over `length` samples using four lifting
// steps (predict, update, predict, update) followed by scaling.
//
//   x         input samples; MODIFIED IN PLACE by the lifting steps.
//   lowpass   receives length / 2 values: even-indexed results times s_low.
//   highpass  receives length / 2 values: odd-indexed results times s_high.
//
// Each predict step adds a weighted sum of the two even neighbours to every
// odd sample; each update step adds a weighted sum of the two odd neighbours
// to every even sample. At the ends, where one neighbour is missing, the
// single available neighbour is counted twice (x[length - 1] and x[0]).
//
// NOTE(review): the update loops read x[i + 1] for even i up to length - 1,
// so the code appears to assume an even length; an odd length would read one
// element past the data.
// NOTE(review): the local copy y holds DWT_MAX_LENGTH doubles, so length must
// not exceed that -- the macro is defined outside this view; confirm.
1856 static void analysis_97(int length, double *x,
1857 double *lowpass, double *highpass) {
1858 static const double a_predict1 = -1.586134342;
1859 static const double a_update1 = -0.05298011854;
1860 static const double a_predict2 = 0.8829110762;
1861 static const double a_update2 = 0.4435068522;
1862 static const double s_low = 1.149604398;
1863 static const double s_high = 1/1.149604398;
1864 int i;
1865 double y[DWT_MAX_LENGTH];
1866 // Predict 1
1867 for (i = 1; i < length - 2; i += 2) {
1868 x[i] += a_predict1 * (x[i - 1] + x[i + 1]);
1869 }
1870 x[length - 1] += 2 * a_predict1 * x[length - 2];
1871 // Update 1
1872 for (i = 2; i < length; i += 2) {
1873 x[i] += a_update1 * (x[i - 1] + x[i + 1]);
1874 }
1875 x[0] += 2 * a_update1 * x[1];
1876 // Predict 2
1877 for (i = 1; i < length - 2; i += 2) {
1878 x[i] += a_predict2 * (x[i - 1] + x[i + 1]);
1879 }
1880 x[length - 1] += 2 * a_predict2 * x[length - 2];
1881 // Update 2
1882 for (i = 2; i < length; i += 2) {
1883 x[i] += a_update2 * (x[i - 1] + x[i + 1]);
1884 }
1885 x[0] += 2 * a_update2 * x[1];
// Work from a copy so that lowpass/highpass can be written even when they
// point into the same storage as x.
1886 memcpy(y, x, sizeof(*y) * length);
1887 // Scale and pack
1888 for (i = 0; i < length / 2; i++) {
1889 lowpass[i] = y[2 * i] * s_low;
1890 highpass[i] = y[2 * i + 1] * s_high;
1891 }
1892 }
1893
// Multi-level 2-D analysis built on analysis_97, computed in double precision
// and rounded into the short output.
//
//   levels         maximum number of decomposition levels to run.
//   width, height  size of the block in x.
//   x, pitch_x     input samples and their row stride in elements.
//   c, pitch_c     output coefficients and their row stride in elements.
//
// The input is copied into the local array y (row stride DWT_MAX_LENGTH),
// shifted left by DWT_PRECISION_BITS. At each level every row of the current
// nw x nh region of y is transformed in place, then every column is
// transformed and the rounded result is stored into c. The function returns
// early as soon as either dimension of the current region is below 2.
//
// NOTE(review): the column-pass results are written only to c and never back
// into y, so at levels after the first the row pass reads y values that do
// not include the previous level's column transform. Confirm whether this is
// intended or whether y should also be updated.
// NOTE(review): if the very first level returns early, c is left unwritten.
// NOTE(review): the left shift is applied to possibly negative samples;
// left-shifting a negative value is undefined in ISO C.
// NOTE(review): y is DWT_MAX_LENGTH * DWT_MAX_LENGTH doubles on the stack and
// presumably requires width and height <= DWT_MAX_LENGTH -- the macro is
// defined outside this view; confirm.
1894 static void dyadic_analyze_97(int levels, int width, int height,
1895 short *x, int pitch_x, short *c, int pitch_c) {
1896 int lv, i, j, nh, nw, hh = height, hw = width;
1897 double buffer[2 * DWT_MAX_LENGTH];
1898 double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH];
1899 for (i = 0; i < height; i++) {
1900 for (j = 0; j < width; j++) {
1901 y[i * DWT_MAX_LENGTH + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS;
1902 }
1903 }
1904 for (lv = 0; lv < levels; lv++) {
// nh/nw: size of the region processed at this level; hh/hw: size of its
// lowpass part, which becomes the region for the next level.
1905 nh = hh;
1906 hh = (hh + 1) >> 1;
1907 nw = hw;
1908 hw = (hw + 1) >> 1;
1909 if ((nh < 2) || (nw < 2)) return;
1910 for (i = 0; i < nh; i++) {
1911 memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer));
1912 analysis_97(nw, buffer, &y[i * DWT_MAX_LENGTH],
1913 &y[i * DWT_MAX_LENGTH] + hw);
1914 }
1915 for (j = 0; j < nw; j++) {
// The column is staged in the upper part of buffer and the results are
// written to the lower part, then rounded into c.
1916 for (i = 0; i < nh; i++)
1917 buffer[i + nh] = y[i * DWT_MAX_LENGTH + j];
1918 analysis_97(nh, buffer + nh, buffer, buffer + hh);
1919 for (i = 0; i < nh; i++)
1920 c[i * pitch_c + j] = round(buffer[i]);
1921 }
1922 }
1923 }
1924
1925 #endif // DWT_TYPE
1926
1927 // TODO(debargha): Implement the scaling differently so as not to have to
1928 // use the floating point dct
1929 static void dct16x16_1d_f(double input[16], double output[16]) {
1930 static const double C1 = 0.995184726672197;
1931 static const double C2 = 0.98078528040323;
1932 static const double C3 = 0.956940335732209;
1933 static const double C4 = 0.923879532511287;
1934 static const double C5 = 0.881921264348355;
1935 static const double C6 = 0.831469612302545;
1936 static const double C7 = 0.773010453362737;
1937 static const double C8 = 0.707106781186548;
1938 static const double C9 = 0.634393284163646;
1939 static const double C10 = 0.555570233019602;
1940 static const double C11 = 0.471396736825998;
1941 static const double C12 = 0.38268343236509;
1942 static const double C13 = 0.290284677254462;
1943 static const double C14 = 0.195090322016128;
1944 static const double C15 = 0.098017140329561;
1945
1946 vp9_clear_system_state(); // Make it simd safe : __asm emms;
1947 {
1948 double step[16];
1949 double intermediate[16];
1950 double temp1, temp2;
1951
1952 // step 1
1953 step[ 0] = input[0] + input[15];
1954 step[ 1] = input[1] + input[14];
1955 step[ 2] = input[2] + input[13];
1956 step[ 3] = input[3] + input[12];
1957 step[ 4] = input[4] + input[11];
1958 step[ 5] = input[5] + input[10];
1959 step[ 6] = input[6] + input[ 9];
1960 step[ 7] = input[7] + input[ 8];
1961 step[ 8] = input[7] - input[ 8];
1962 step[ 9] = input[6] - input[ 9];
1963 step[10] = input[5] - input[10];
1964 step[11] = input[4] - input[11];
1965 step[12] = input[3] - input[12];
1966 step[13] = input[2] - input[13];
1967 step[14] = input[1] - input[14];
1968 step[15] = input[0] - input[15];
1969
1970 // step 2
1971 output[0] = step[0] + step[7];
1972 output[1] = step[1] + step[6];
1973 output[2] = step[2] + step[5];
1974 output[3] = step[3] + step[4];
1975 output[4] = step[3] - step[4];
1976 output[5] = step[2] - step[5];
1977 output[6] = step[1] - step[6];
1978 output[7] = step[0] - step[7];
1979
1980 temp1 = step[ 8]*C7;
1981 temp2 = step[15]*C9;
1982 output[ 8] = temp1 + temp2;
1983
1984 temp1 = step[ 9]*C11;
1985 temp2 = step[14]*C5;
1986 output[ 9] = temp1 - temp2;
1987
1988 temp1 = step[10]*C3;
1989 temp2 = step[13]*C13;
1990 output[10] = temp1 + temp2;
1991
1992 temp1 = step[11]*C15;
1993 temp2 = step[12]*C1;
1994 output[11] = temp1 - temp2;
1995
1996 temp1 = step[11]*C1;
1997 temp2 = step[12]*C15;
1998 output[12] = temp2 + temp1;
1999
2000 temp1 = step[10]*C13;
2001 temp2 = step[13]*C3;
2002 output[13] = temp2 - temp1;
2003
2004 temp1 = step[ 9]*C5;
2005 temp2 = step[14]*C11;
2006 output[14] = temp2 + temp1;
2007
2008 temp1 = step[ 8]*C9;
2009 temp2 = step[15]*C7;
2010 output[15] = temp2 - temp1;
2011
2012 // step 3
2013 step[ 0] = output[0] + output[3];
2014 step[ 1] = output[1] + output[2];
2015 step[ 2] = output[1] - output[2];
2016 step[ 3] = output[0] - output[3];
2017
2018 temp1 = output[4]*C14;
2019 temp2 = output[7]*C2;
2020 step[ 4] = temp1 + temp2;
2021
2022 temp1 = output[5]*C10;
2023 temp2 = output[6]*C6;
2024 step[ 5] = temp1 + temp2;
2025
2026 temp1 = output[5]*C6;
2027 temp2 = output[6]*C10;
2028 step[ 6] = temp2 - temp1;
2029
2030 temp1 = output[4]*C2;
2031 temp2 = output[7]*C14;
2032 step[ 7] = temp2 - temp1;
2033
2034 step[ 8] = output[ 8] + output[11];
2035 step[ 9] = output[ 9] + output[10];
2036 step[10] = output[ 9] - output[10];
2037 step[11] = output[ 8] - output[11];
2038
2039 step[12] = output[12] + output[15];
2040 step[13] = output[13] + output[14];
2041 step[14] = output[13] - output[14];
2042 step[15] = output[12] - output[15];
2043
2044 // step 4
2045 output[ 0] = (step[ 0] + step[ 1]);
2046 output[ 8] = (step[ 0] - step[ 1]);
2047
2048 temp1 = step[2]*C12;
2049 temp2 = step[3]*C4;
2050 temp1 = temp1 + temp2;
2051 output[ 4] = 2*(temp1*C8);
2052
2053 temp1 = step[2]*C4;
2054 temp2 = step[3]*C12;
2055 temp1 = temp2 - temp1;
2056 output[12] = 2*(temp1*C8);
2057
2058 output[ 2] = 2*((step[4] + step[ 5])*C8);
2059 output[14] = 2*((step[7] - step[ 6])*C8);
2060
2061 temp1 = step[4] - step[5];
2062 temp2 = step[6] + step[7];
2063 output[ 6] = (temp1 + temp2);
2064 output[10] = (temp1 - temp2);
2065
2066 intermediate[8] = step[8] + step[14];
2067 intermediate[9] = step[9] + step[15];
2068
2069 temp1 = intermediate[8]*C12;
2070 temp2 = intermediate[9]*C4;
2071 temp1 = temp1 - temp2;
2072 output[3] = 2*(temp1*C8);
2073
2074 temp1 = intermediate[8]*C4;
2075 temp2 = intermediate[9]*C12;
2076 temp1 = temp2 + temp1;
2077 output[13] = 2*(temp1*C8);
2078
2079 output[ 9] = 2*((step[10] + step[11])*C8);
2080
2081 intermediate[11] = step[10] - step[11];
2082 intermediate[12] = step[12] + step[13];
2083 intermediate[13] = step[12] - step[13];
2084 intermediate[14] = step[ 8] - step[14];
2085 intermediate[15] = step[ 9] - step[15];
2086
2087 output[15] = (intermediate[11] + intermediate[12]);
2088 output[ 1] = -(intermediate[11] - intermediate[12]);
2089
2090 output[ 7] = 2*(intermediate[13]*C8);
2091
2092 temp1 = intermediate[14]*C12;
2093 temp2 = intermediate[15]*C4;
2094 temp1 = temp1 - temp2;
2095 output[11] = -2*(temp1*C8);
2096
2097 temp1 = intermediate[14]*C4;
2098 temp2 = intermediate[15]*C12;
2099 temp1 = temp2 + temp1;
2100 output[ 5] = 2*(temp1*C8);
2101 }
2102 vp9_clear_system_state(); // Make it simd safe : __asm emms;
2103 }
2104
/* 2-D 16x16 forward DCT evaluated in double precision.
 *
 * input : source samples; element (r, c) is read from input[r * (pitch >> 1) + c].
 * out   : receives 256 coefficients stored contiguously, 16 per row.
 * pitch : halved to obtain the row stride of `input` in short elements.
 * scale : every coefficient is divided by (2 << scale) before rounding.
 *
 * dct16x16_1d_f() is run down each column first and then along each row of
 * the intermediate result.  vp9_clear_system_state() brackets the floating
 * point work (original note: "Make it simd safe : __asm emms").
 */
static void vp9_short_fdct16x16_c_f(short *input, short *out, int pitch,
                                    int scale) {
  const int stride = pitch >> 1;
  double coeffs[16 * 16];
  double line_in[16], line_out[16];
  int r, c, n;

  vp9_clear_system_state();

  // Pass 1: transform each column of the input block.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r)
      line_in[r] = input[r * stride + c];
    dct16x16_1d_f(line_in, line_out);
    for (r = 0; r < 16; ++r)
      coeffs[r * 16 + c] = line_out[r];
  }

  // Pass 2: transform each row of the intermediate result in place.
  for (r = 0; r < 16; ++r) {
    double *const row = coeffs + r * 16;
    for (c = 0; c < 16; ++c)
      line_in[c] = row[c];
    dct16x16_1d_f(line_in, line_out);
    for (c = 0; c < 16; ++c)
      row[c] = line_out[c];
  }

  // Normalise, round to nearest and narrow to short.
  for (n = 0; n < 16 * 16; ++n)
    out[n] = (short)round(coeffs[n] / (2 << scale));

  vp9_clear_system_state();
}
2136
2137 void vp9_short_fdct8x8_c_f(short *block, short *coefs, int pitch, int scale) {
2138 int j1, i, j, k;
2139 float b[8];
2140 float b1[8];
2141 float d[8][8];
2142 float f0 = (float) .7071068;
2143 float f1 = (float) .4903926;
2144 float f2 = (float) .4619398;
2145 float f3 = (float) .4157348;
2146 float f4 = (float) .3535534;
2147 float f5 = (float) .2777851;
2148 float f6 = (float) .1913417;
2149 float f7 = (float) .0975452;
2150 pitch = pitch / 2;
2151 for (i = 0, k = 0; i < 8; i++, k += pitch) {
2152 for (j = 0; j < 8; j++) {
2153 b[j] = (float)(block[k + j] << (3 - scale));
2154 }
2155 /* Horizontal transform */
2156 for (j = 0; j < 4; j++) {
2157 j1 = 7 - j;
2158 b1[j] = b[j] + b[j1];
2159 b1[j1] = b[j] - b[j1];
2160 }
2161 b[0] = b1[0] + b1[3];
2162 b[1] = b1[1] + b1[2];
2163 b[2] = b1[1] - b1[2];
2164 b[3] = b1[0] - b1[3];
2165 b[4] = b1[4];
2166 b[5] = (b1[6] - b1[5]) * f0;
2167 b[6] = (b1[6] + b1[5]) * f0;
2168 b[7] = b1[7];
2169 d[i][0] = (b[0] + b[1]) * f4;
2170 d[i][4] = (b[0] - b[1]) * f4;
2171 d[i][2] = b[2] * f6 + b[3] * f2;
2172 d[i][6] = b[3] * f6 - b[2] * f2;
2173 b1[4] = b[4] + b[5];
2174 b1[7] = b[7] + b[6];
2175 b1[5] = b[4] - b[5];
2176 b1[6] = b[7] - b[6];
2177 d[i][1] = b1[4] * f7 + b1[7] * f1;
2178 d[i][5] = b1[5] * f3 + b1[6] * f5;
2179 d[i][7] = b1[7] * f7 - b1[4] * f1;
2180 d[i][3] = b1[6] * f3 - b1[5] * f5;
2181 }
2182 /* Vertical transform */
2183 for (i = 0; i < 8; i++) {
2184 for (j = 0; j < 4; j++) {
2185 j1 = 7 - j;
2186 b1[j] = d[j][i] + d[j1][i];
2187 b1[j1] = d[j][i] - d[j1][i];
2188 }
2189 b[0] = b1[0] + b1[3];
2190 b[1] = b1[1] + b1[2];
2191 b[2] = b1[1] - b1[2];
2192 b[3] = b1[0] - b1[3];
2193 b[4] = b1[4];
2194 b[5] = (b1[6] - b1[5]) * f0;
2195 b[6] = (b1[6] + b1[5]) * f0;
2196 b[7] = b1[7];
2197 d[0][i] = (b[0] + b[1]) * f4;
2198 d[4][i] = (b[0] - b[1]) * f4;
2199 d[2][i] = b[2] * f6 + b[3] * f2;
2200 d[6][i] = b[3] * f6 - b[2] * f2;
2201 b1[4] = b[4] + b[5];
2202 b1[7] = b[7] + b[6];
2203 b1[5] = b[4] - b[5];
2204 b1[6] = b[7] - b[6];
2205 d[1][i] = b1[4] * f7 + b1[7] * f1;
2206 d[5][i] = b1[5] * f3 + b1[6] * f5;
2207 d[7][i] = b1[7] * f7 - b1[4] * f1;
2208 d[3][i] = b1[6] * f3 - b1[5] * f5;
2209 }
2210 for (i = 0; i < 8; i++) {
2211 for (j = 0; j < 8; j++) {
2212 *(coefs + j + i * 8) = (short) floor(d[i][j] + 0.5);
2213 }
2214 }
2215 return;
2216 }
2217
/* Scales d down by 2^n with a right shift; for negative n it scales d up by
 * 2^-n instead.  The up-scaling arm multiplies by (1 << -n): shifting by the
 * negative count n itself is undefined behavior in C, as is left-shifting a
 * negative d.  Both arguments may be evaluated more than once. */
#define divide_bits(d, n) ((n) < 0 ? (d) * (1 << -(n)) : (d) >> (n))
2219
2220 #if DWTDCT_TYPE == DWTDCT16X16_LEAN
2221
2222 void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
2223 // assume out is a 32x32 buffer
2224 short buffer[16 * 16];
2225 int i, j;
2226 const int short_pitch = pitch >> 1;
2227 #if DWT_TYPE == 26
2228 dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32);
2229 #elif DWT_TYPE == 97
2230 dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32);
2231 #elif DWT_TYPE == 53
2232 dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32);
2233 #endif
2234 // TODO(debargha): Implement more efficiently by adding output pitch
2235 // argument to the dct16x16 function
2236 vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);
2237 for (i = 0; i < 16; ++i)
2238 vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);
2239 for (i = 0; i < 16; ++i) {
2240 for (j = 16; j < 32; ++j) {
2241 out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);
2242 }
2243 }
2244 for (i = 16; i < 32; ++i) {
2245 for (j = 0; j < 32; ++j) {
2246 out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);
2247 }
2248 }
2249 }
2250
2251 #elif DWTDCT_TYPE == DWTDCT16X16
2252
2253 void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
2254 // assume out is a 32x32 buffer
2255 short buffer[16 * 16];
2256 int i, j;
2257 const int short_pitch = pitch >> 1;
2258 #if DWT_TYPE == 26
2259 dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32);
2260 #elif DWT_TYPE == 97
2261 dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32);
2262 #elif DWT_TYPE == 53
2263 dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32);
2264 #endif
2265 // TODO(debargha): Implement more efficiently by adding output pitch
2266 // argument to the dct16x16 function
2267 vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);
2268 for (i = 0; i < 16; ++i)
2269 vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);
2270 vp9_short_fdct16x16_c_f(out + 16, buffer, 64, 1 + DWT_PRECISION_BITS);
2271 for (i = 0; i < 16; ++i)
2272 vpx_memcpy(out + i * 32 + 16, buffer + i * 16, sizeof(short) * 16);
2273
2274 vp9_short_fdct16x16_c_f(out + 32 * 16, buffer, 64, 1 + DWT_PRECISION_BITS);
2275 for (i = 0; i < 16; ++i)
2276 vpx_memcpy(out + i * 32 + 32 * 16, buffer + i * 16, sizeof(short) * 16);
2277
2278 vp9_short_fdct16x16_c_f(out + 33 * 16, buffer, 64, 1 + DWT_PRECISION_BITS);
2279 for (i = 0; i < 16; ++i)
2280 vpx_memcpy(out + i * 32 + 33 * 16, buffer + i * 16, sizeof(short) * 16);
2281 }
2282
2283 #elif DWTDCT_TYPE == DWTDCT8X8
2284
2285 void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
2286 // assume out is a 32x32 buffer
2287 short buffer[8 * 8];
2288 int i, j;
2289 const int short_pitch = pitch >> 1;
2290 #if DWT_TYPE == 26
2291 dyadic_analyze_26(2, 32, 32, input, short_pitch, out, 32);
2292 #elif DWT_TYPE == 97
2293 dyadic_analyze_97(2, 32, 32, input, short_pitch, out, 32);
2294 #elif DWT_TYPE == 53
2295 dyadic_analyze_53(2, 32, 32, input, short_pitch, out, 32);
2296 #endif
2297 // TODO(debargha): Implement more efficiently by adding output pitch
2298 // argument to the dct16x16 function
2299 vp9_short_fdct8x8_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);
2300 for (i = 0; i < 8; ++i)
2301 vpx_memcpy(out + i * 32, buffer + i * 8, sizeof(short) * 8);
2302
2303 vp9_short_fdct8x8_c_f(out + 8, buffer, 64, 1 + DWT_PRECISION_BITS);
2304 for (i = 0; i < 8; ++i)
2305 vpx_memcpy(out + i * 32 + 8, buffer + i * 8, sizeof(short) * 8);
2306
2307 vp9_short_fdct8x8_c_f(out + 32 * 8, buffer, 64, 1 + DWT_PRECISION_BITS);
2308 for (i = 0; i < 8; ++i)
2309 vpx_memcpy(out + i * 32 + 32 * 8, buffer + i * 8, sizeof(short) * 8);
2310
2311 vp9_short_fdct8x8_c_f(out + 33 * 8, buffer, 64, 1 + DWT_PRECISION_BITS);
2312 for (i = 0; i < 8; ++i)
2313 vpx_memcpy(out + i * 32 + 33 * 8, buffer + i * 8, sizeof(short) * 8);
2314
2315 for (i = 0; i < 16; ++i) {
2316 for (j = 16; j < 32; ++j) {
2317 out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);
2318 }
2319 }
2320 for (i = 16; i < 32; ++i) {
2321 for (j = 0; j < 32; ++j) {
2322 out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);
2323 }
2324 }
2325 }
2326
2327 #endif
2328
2329 #if CONFIG_TX64X64
2330 void vp9_short_fdct64x64_c(short *input, short *out, int pitch) {
2331 // assume out is a 64x64 buffer
2332 short buffer[16 * 16];
2333 int i, j;
2334 const int short_pitch = pitch >> 1;
2335 #if DWT_TYPE == 26
2336 dyadic_analyze_26(2, 64, 64, input, short_pitch, out, 64);
2337 #elif DWT_TYPE == 97
2338 dyadic_analyze_97(2, 64, 64, input, short_pitch, out, 64);
2339 #elif DWT_TYPE == 53
2340 dyadic_analyze_53(2, 64, 64, input, short_pitch, out, 64);
2341 #endif
2342 // TODO(debargha): Implement more efficiently by adding output pitch
2343 // argument to the dct16x16 function
2344 vp9_short_fdct16x16_c_f(out, buffer, 128, 2 + DWT_PRECISION_BITS);
2345 for (i = 0; i < 16; ++i)
2346 vpx_memcpy(out + i * 64, buffer + i * 16, sizeof(short) * 16);
2347
2348 #if DWTDCT_TYPE == DWTDCT16X16_LEAN
2349 for (i = 0; i < 16; ++i) {
2350 for (j = 16; j < 48; ++j) {
2351 out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
2352 }
2353 }
2354 for (i = 16; i < 64; ++i) {
2355 for (j = 0; j < 64; ++j) {
2356 out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
2357 }
2358 }
2359 #elif DWTDCT_TYPE == DWTDCT16X16
2360 vp9_short_fdct16x16_c_f(out + 16, buffer, 128, 2 + DWT_PRECISION_BITS);
2361 for (i = 0; i < 16; ++i)
2362 vpx_memcpy(out + i * 64 + 16, buffer + i * 16, sizeof(short) * 16);
2363
2364 vp9_short_fdct16x16_c_f(out + 64 * 16, buffer, 128, 2 + DWT_PRECISION_BITS);
2365 for (i = 0; i < 16; ++i)
2366 vpx_memcpy(out + i * 64 + 64 * 16, buffer + i * 16, sizeof(short) * 16);
2367
2368 vp9_short_fdct16x16_c_f(out + 65 * 16, buffer, 128, 2 + DWT_PRECISION_BITS);
2369 for (i = 0; i < 16; ++i)
2370 vpx_memcpy(out + i * 64 + 65 * 16, buffer + i * 16, sizeof(short) * 16);
2371
2372 // There is no dct used on the highest bands for now.
2373 // Need to scale these coeffs by a factor of 2/2^DWT_PRECISION_BITS
2374 // TODO(debargha): experiment with turning these coeffs to 0
2375 for (i = 0; i < 32; ++i) {
2376 for (j = 32; j < 64; ++j) {
2377 out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
2378 }
2379 }
2380 for (i = 32; i < 64; ++i) {
2381 for (j = 0; j < 64; ++j) {
2382 out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
2383 }
2384 }
2385 #endif // DWTDCT_TYPE
2386 }
2387 #endif // CONFIG_TX64X64
2388 #endif // CONFIG_DWTDCTHYBRID
OLDNEW
« no previous file with comments | « source/libvpx/vp9/encoder/vp9_boolhuff.h ('k') | source/libvpx/vp9/encoder/vp9_encodeframe.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698