OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 | 11 |
12 #include <assert.h> | 12 #include <assert.h> |
13 #include <math.h> | 13 #include <math.h> |
14 #include "vpx_ports/config.h" | 14 #include "./vpx_config.h" |
15 #include "vp9/common/vp9_systemdependent.h" | 15 #include "vp9/common/vp9_systemdependent.h" |
16 | 16 |
17 #include "vp9/common/vp9_blockd.h" | 17 #include "vp9/common/vp9_blockd.h" |
18 | 18 |
19 // TODO: these transforms can be converted into integer forms to reduce | 19 // TODO: these transforms can be converted into integer forms to reduce |
20 // the complexity | 20 // the complexity |
21 static const float dct_4[16] = { | 21 static const float dct_4[16] = { |
22 0.500000000000000, 0.500000000000000, 0.500000000000000, 0.500000000000000, | 22 0.500000000000000, 0.500000000000000, 0.500000000000000, 0.500000000000000, |
23 0.653281482438188, 0.270598050073099, -0.270598050073099, -0.653281482438188, | 23 0.653281482438188, 0.270598050073099, -0.270598050073099, -0.653281482438188, |
24 0.500000000000000, -0.500000000000000, -0.500000000000000, 0.500000000000000, | 24 0.500000000000000, -0.500000000000000, -0.500000000000000, 0.500000000000000, |
(...skipping 870 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
895 } | 895 } |
896 | 896 |
// Walsh transform over an 8-wide, 4-high region, done as two calls to
// vp9_short_walsh4x4_x8_c: one on input/output as given, and one on the
// block starting 4 samples into the input, whose result is stored 16
// coefficients into the output.  pitch is forwarded unchanged.
void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) {
  short *const second_in = input + 4;
  short *const second_out = output + 16;

  vp9_short_walsh4x4_x8_c(input, output, pitch);
  vp9_short_walsh4x4_x8_c(second_in, second_out, pitch);
}
901 #endif | 901 #endif |
902 | 902 |
903 #define TEST_INT_16x16_DCT 1 | 903 #define TEST_INT_16x16_DCT 1 |
904 #if !TEST_INT_16x16_DCT | 904 #if !TEST_INT_16x16_DCT |
905 static const double C1 = 0.995184726672197; | |
906 static const double C2 = 0.98078528040323; | |
907 static const double C3 = 0.956940335732209; | |
908 static const double C4 = 0.923879532511287; | |
909 static const double C5 = 0.881921264348355; | |
910 static const double C6 = 0.831469612302545; | |
911 static const double C7 = 0.773010453362737; | |
912 static const double C8 = 0.707106781186548; | |
913 static const double C9 = 0.634393284163646; | |
914 static const double C10 = 0.555570233019602; | |
915 static const double C11 = 0.471396736825998; | |
916 static const double C12 = 0.38268343236509; | |
917 static const double C13 = 0.290284677254462; | |
918 static const double C14 = 0.195090322016128; | |
919 static const double C15 = 0.098017140329561; | |
920 | 905 |
921 static void dct16x16_1d(double input[16], double output[16]) { | 906 static void dct16x16_1d(double input[16], double output[16]) { |
| 907 static const double C1 = 0.995184726672197; |
| 908 static const double C2 = 0.98078528040323; |
| 909 static const double C3 = 0.956940335732209; |
| 910 static const double C4 = 0.923879532511287; |
| 911 static const double C5 = 0.881921264348355; |
| 912 static const double C6 = 0.831469612302545; |
| 913 static const double C7 = 0.773010453362737; |
| 914 static const double C8 = 0.707106781186548; |
| 915 static const double C9 = 0.634393284163646; |
| 916 static const double C10 = 0.555570233019602; |
| 917 static const double C11 = 0.471396736825998; |
| 918 static const double C12 = 0.38268343236509; |
| 919 static const double C13 = 0.290284677254462; |
| 920 static const double C14 = 0.195090322016128; |
| 921 static const double C15 = 0.098017140329561; |
| 922 |
922 vp9_clear_system_state(); // Make it simd safe : __asm emms; | 923 vp9_clear_system_state(); // Make it simd safe : __asm emms; |
923 { | 924 { |
924 double step[16]; | 925 double step[16]; |
925 double intermediate[16]; | 926 double intermediate[16]; |
926 double temp1, temp2; | 927 double temp1, temp2; |
927 | 928 |
928 // step 1 | 929 // step 1 |
929 step[ 0] = input[0] + input[15]; | 930 step[ 0] = input[0] + input[15]; |
930 step[ 1] = input[1] + input[14]; | 931 step[ 1] = input[1] + input[14]; |
931 step[ 2] = input[2] + input[13]; | 932 step[ 2] = input[2] + input[13]; |
(...skipping 391 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1323 // Then transform rows | 1324 // Then transform rows |
1324 for (i = 0; i < 16; ++i) { | 1325 for (i = 0; i < 16; ++i) { |
1325 dct16x16_1d(outptr, out, 1); | 1326 dct16x16_1d(outptr, out, 1); |
1326 outptr += 16; | 1327 outptr += 16; |
1327 out += 16; | 1328 out += 16; |
1328 } | 1329 } |
1329 } | 1330 } |
1330 #undef RIGHT_SHIFT | 1331 #undef RIGHT_SHIFT |
1331 #undef ROUNDING | 1332 #undef ROUNDING |
1332 #endif | 1333 #endif |
| 1334 |
| 1335 #if !CONFIG_DWTDCTHYBRID |
// One-dimensional 32-point forward DCT in double precision.
//
// Reads the 32 samples input[stride * k] and writes the 32 coefficients
// output[stride * k], k = 0..31.  The transform is a butterfly network: the
// local array t[] and the output array alternate as source and destination
// of successive stages, so output is also used as scratch space.  Every
// input sample is consumed in stage 1, before output is first written.
// The last stage stores its results in bit-reversed index order.
static void dct32_1d(double *input, double *output, int stride) {
  static const double C1 = 0.998795456205;   // cos(pi * 1 / 64)
  static const double C2 = 0.995184726672;   // cos(pi * 2 / 64)
  static const double C3 = 0.989176509965;   // cos(pi * 3 / 64)
  static const double C4 = 0.980785280403;   // cos(pi * 4 / 64)
  static const double C5 = 0.970031253195;   // cos(pi * 5 / 64)
  static const double C6 = 0.956940335732;   // cos(pi * 6 / 64)
  static const double C7 = 0.941544065183;   // cos(pi * 7 / 64)
  static const double C8 = 0.923879532511;   // cos(pi * 8 / 64)
  static const double C9 = 0.903989293123;   // cos(pi * 9 / 64)
  static const double C10 = 0.881921264348;  // cos(pi * 10 / 64)
  static const double C11 = 0.857728610000;  // cos(pi * 11 / 64)
  static const double C12 = 0.831469612303;  // cos(pi * 12 / 64)
  static const double C13 = 0.803207531481;  // cos(pi * 13 / 64)
  static const double C14 = 0.773010453363;  // cos(pi * 14 / 64)
  static const double C15 = 0.740951125355;  // cos(pi * 15 / 64)
  static const double C16 = 0.707106781187;  // cos(pi * 16 / 64)
  static const double C17 = 0.671558954847;  // cos(pi * 17 / 64)
  static const double C18 = 0.634393284164;  // cos(pi * 18 / 64)
  static const double C19 = 0.595699304492;  // cos(pi * 19 / 64)
  static const double C20 = 0.555570233020;  // cos(pi * 20 / 64)
  static const double C21 = 0.514102744193;  // cos(pi * 21 / 64)
  static const double C22 = 0.471396736826;  // cos(pi * 22 / 64)
  static const double C23 = 0.427555093430;  // cos(pi * 23 / 64)
  static const double C24 = 0.382683432365;  // cos(pi * 24 / 64)
  static const double C25 = 0.336889853392;  // cos(pi * 25 / 64)
  static const double C26 = 0.290284677254;  // cos(pi * 26 / 64)
  static const double C27 = 0.242980179903;  // cos(pi * 27 / 64)
  static const double C28 = 0.195090322016;  // cos(pi * 28 / 64)
  static const double C29 = 0.146730474455;  // cos(pi * 29 / 64)
  static const double C30 = 0.098017140330;  // cos(pi * 30 / 64)
  static const double C31 = 0.049067674327;  // cos(pi * 31 / 64)

  // Output slot that receives t[k] (k = 0..15) in the final stage: the
  // 5-bit bit-reversal of k.
  static const int even_slot[16] = {
    0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30
  };

  double t[32];
  int k, g;

#define DCT32_IN(n) input[stride * (n)]
#define DCT32_OUT(n) output[stride * (n)]

  // Stage 1: mirror sums in the lower half, mirror differences in the upper.
  for (k = 0; k < 16; ++k)
    t[k] = DCT32_IN(k) + DCT32_IN(31 - k);
  for (k = 16; k < 32; ++k)
    t[k] = -DCT32_IN(k) + DCT32_IN(31 - k);

  // Stage 2
  for (k = 0; k < 8; ++k)
    DCT32_OUT(k) = t[k] + t[15 - k];
  for (k = 8; k < 16; ++k)
    DCT32_OUT(k) = -t[k] + t[15 - k];
  for (k = 16; k < 20; ++k)
    DCT32_OUT(k) = t[k];
  for (k = 20; k < 24; ++k)
    DCT32_OUT(k) = (-t[k] + t[47 - k]) * C16;
  for (k = 24; k < 28; ++k)
    DCT32_OUT(k) = (t[k] + t[47 - k]) * C16;
  for (k = 28; k < 32; ++k)
    DCT32_OUT(k) = t[k];

  // Stage 3
  for (k = 0; k < 4; ++k)
    t[k] = DCT32_OUT(k) + DCT32_OUT(7 - k);
  for (k = 4; k < 8; ++k)
    t[k] = -DCT32_OUT(k) + DCT32_OUT(7 - k);
  t[8] = DCT32_OUT(8);
  t[9] = DCT32_OUT(9);
  t[10] = (-DCT32_OUT(10) + DCT32_OUT(13)) * C16;
  t[11] = (-DCT32_OUT(11) + DCT32_OUT(12)) * C16;
  t[12] = (DCT32_OUT(12) + DCT32_OUT(11)) * C16;
  t[13] = (DCT32_OUT(13) + DCT32_OUT(10)) * C16;
  t[14] = DCT32_OUT(14);
  t[15] = DCT32_OUT(15);
  for (k = 16; k < 20; ++k)
    t[k] = DCT32_OUT(k) + DCT32_OUT(39 - k);
  for (k = 20; k < 24; ++k)
    t[k] = -DCT32_OUT(k) + DCT32_OUT(39 - k);
  for (k = 24; k < 28; ++k)
    t[k] = -DCT32_OUT(k) + DCT32_OUT(55 - k);
  for (k = 28; k < 32; ++k)
    t[k] = DCT32_OUT(k) + DCT32_OUT(55 - k);

  // Stage 4
  DCT32_OUT(0) = t[0] + t[3];
  DCT32_OUT(1) = t[1] + t[2];
  DCT32_OUT(2) = -t[2] + t[1];
  DCT32_OUT(3) = -t[3] + t[0];
  DCT32_OUT(4) = t[4];
  DCT32_OUT(5) = (-t[5] + t[6]) * C16;
  DCT32_OUT(6) = (t[6] + t[5]) * C16;
  DCT32_OUT(7) = t[7];
  DCT32_OUT(8) = t[8] + t[11];
  DCT32_OUT(9) = t[9] + t[10];
  DCT32_OUT(10) = -t[10] + t[9];
  DCT32_OUT(11) = -t[11] + t[8];
  DCT32_OUT(12) = -t[12] + t[15];
  DCT32_OUT(13) = -t[13] + t[14];
  DCT32_OUT(14) = t[14] + t[13];
  DCT32_OUT(15) = t[15] + t[12];

  DCT32_OUT(16) = t[16];
  DCT32_OUT(17) = t[17];
  DCT32_OUT(18) = t[18] * -C8 + t[29] * C24;
  DCT32_OUT(19) = t[19] * -C8 + t[28] * C24;
  DCT32_OUT(20) = t[20] * -C24 + t[27] * -C8;
  DCT32_OUT(21) = t[21] * -C24 + t[26] * -C8;
  DCT32_OUT(22) = t[22];
  DCT32_OUT(23) = t[23];
  DCT32_OUT(24) = t[24];
  DCT32_OUT(25) = t[25];
  DCT32_OUT(26) = t[26] * C24 + t[21] * -C8;
  DCT32_OUT(27) = t[27] * C24 + t[20] * -C8;
  DCT32_OUT(28) = t[28] * C8 + t[19] * C24;
  DCT32_OUT(29) = t[29] * C8 + t[18] * C24;
  DCT32_OUT(30) = t[30];
  DCT32_OUT(31) = t[31];

  // Stage 5
  t[0] = (DCT32_OUT(0) + DCT32_OUT(1)) * C16;
  t[1] = (-DCT32_OUT(1) + DCT32_OUT(0)) * C16;
  t[2] = DCT32_OUT(2) * C24 + DCT32_OUT(3) * C8;
  t[3] = DCT32_OUT(3) * C24 - DCT32_OUT(2) * C8;
  t[4] = DCT32_OUT(4) + DCT32_OUT(5);
  t[5] = -DCT32_OUT(5) + DCT32_OUT(4);
  t[6] = -DCT32_OUT(6) + DCT32_OUT(7);
  t[7] = DCT32_OUT(7) + DCT32_OUT(6);
  t[8] = DCT32_OUT(8);
  t[9] = DCT32_OUT(9) * -C8 + DCT32_OUT(14) * C24;
  t[10] = DCT32_OUT(10) * -C24 + DCT32_OUT(13) * -C8;
  t[11] = DCT32_OUT(11);
  t[12] = DCT32_OUT(12);
  t[13] = DCT32_OUT(13) * C24 + DCT32_OUT(10) * -C8;
  t[14] = DCT32_OUT(14) * C8 + DCT32_OUT(9) * C24;
  t[15] = DCT32_OUT(15);

  // Indices 16..31 are handled in two groups of eight; within each group
  // the first four and last four use mirrored sign patterns.
  for (g = 16; g < 32; g += 8) {
    t[g + 0] = DCT32_OUT(g + 0) + DCT32_OUT(g + 3);
    t[g + 1] = DCT32_OUT(g + 1) + DCT32_OUT(g + 2);
    t[g + 2] = -DCT32_OUT(g + 2) + DCT32_OUT(g + 1);
    t[g + 3] = -DCT32_OUT(g + 3) + DCT32_OUT(g + 0);
    t[g + 4] = -DCT32_OUT(g + 4) + DCT32_OUT(g + 7);
    t[g + 5] = -DCT32_OUT(g + 5) + DCT32_OUT(g + 6);
    t[g + 6] = DCT32_OUT(g + 6) + DCT32_OUT(g + 5);
    t[g + 7] = DCT32_OUT(g + 7) + DCT32_OUT(g + 4);
  }

  // Stage 6
  for (k = 0; k < 4; ++k)
    DCT32_OUT(k) = t[k];
  DCT32_OUT(4) = t[4] * C28 + t[7] * C4;
  DCT32_OUT(5) = t[5] * C12 + t[6] * C20;
  DCT32_OUT(6) = t[6] * C12 + t[5] * -C20;
  DCT32_OUT(7) = t[7] * C28 + t[4] * -C4;
  for (g = 8; g < 16; g += 4) {
    DCT32_OUT(g + 0) = t[g + 0] + t[g + 1];
    DCT32_OUT(g + 1) = -t[g + 1] + t[g + 0];
    DCT32_OUT(g + 2) = -t[g + 2] + t[g + 3];
    DCT32_OUT(g + 3) = t[g + 3] + t[g + 2];
  }

  DCT32_OUT(16) = t[16];
  DCT32_OUT(17) = t[17] * -C4 + t[30] * C28;
  DCT32_OUT(18) = t[18] * -C28 + t[29] * -C4;
  DCT32_OUT(19) = t[19];
  DCT32_OUT(20) = t[20];
  DCT32_OUT(21) = t[21] * -C20 + t[26] * C12;
  DCT32_OUT(22) = t[22] * -C12 + t[25] * -C20;
  DCT32_OUT(23) = t[23];
  DCT32_OUT(24) = t[24];
  DCT32_OUT(25) = t[25] * C12 + t[22] * -C20;
  DCT32_OUT(26) = t[26] * C20 + t[21] * C12;
  DCT32_OUT(27) = t[27];
  DCT32_OUT(28) = t[28];
  DCT32_OUT(29) = t[29] * C28 + t[18] * -C4;
  DCT32_OUT(30) = t[30] * C4 + t[17] * C28;
  DCT32_OUT(31) = t[31];

  // Stage 7
  for (k = 0; k < 8; ++k)
    t[k] = DCT32_OUT(k);
  t[8] = DCT32_OUT(8) * C30 + DCT32_OUT(15) * C2;
  t[9] = DCT32_OUT(9) * C14 + DCT32_OUT(14) * C18;
  t[10] = DCT32_OUT(10) * C22 + DCT32_OUT(13) * C10;
  t[11] = DCT32_OUT(11) * C6 + DCT32_OUT(12) * C26;
  t[12] = DCT32_OUT(12) * C6 + DCT32_OUT(11) * -C26;
  t[13] = DCT32_OUT(13) * C22 + DCT32_OUT(10) * -C10;
  t[14] = DCT32_OUT(14) * C14 + DCT32_OUT(9) * -C18;
  t[15] = DCT32_OUT(15) * C30 + DCT32_OUT(8) * -C2;
  for (g = 16; g < 32; g += 4) {
    t[g + 0] = DCT32_OUT(g + 0) + DCT32_OUT(g + 1);
    t[g + 1] = -DCT32_OUT(g + 1) + DCT32_OUT(g + 0);
    t[g + 2] = -DCT32_OUT(g + 2) + DCT32_OUT(g + 3);
    t[g + 3] = DCT32_OUT(g + 3) + DCT32_OUT(g + 2);
  }

  // Final stage --- output indices are bit-reversed.
  for (k = 0; k < 16; ++k)
    DCT32_OUT(even_slot[k]) = t[k];

  DCT32_OUT(1) = t[16] * C31 + t[31] * C1;
  DCT32_OUT(17) = t[17] * C15 + t[30] * C17;
  DCT32_OUT(9) = t[18] * C23 + t[29] * C9;
  DCT32_OUT(25) = t[19] * C7 + t[28] * C25;
  DCT32_OUT(5) = t[20] * C27 + t[27] * C5;
  DCT32_OUT(21) = t[21] * C11 + t[26] * C21;
  DCT32_OUT(13) = t[22] * C19 + t[25] * C13;
  DCT32_OUT(29) = t[23] * C3 + t[24] * C29;
  DCT32_OUT(3) = t[24] * C3 + t[23] * -C29;
  DCT32_OUT(19) = t[25] * C19 + t[22] * -C13;
  DCT32_OUT(11) = t[26] * C11 + t[21] * -C21;
  DCT32_OUT(27) = t[27] * C27 + t[20] * -C5;
  DCT32_OUT(7) = t[28] * C7 + t[19] * -C25;
  DCT32_OUT(23) = t[29] * C23 + t[18] * -C9;
  DCT32_OUT(15) = t[30] * C15 + t[17] * -C17;
  DCT32_OUT(31) = t[31] * C31 + t[16] * -C1;

#undef DCT32_IN
#undef DCT32_OUT
}
| 1653 |
// Forward 32x32 DCT computed in double precision.
//
// input: 32x32 block of samples; consecutive rows are (pitch >> 1) elements
//        apart (presumably pitch is given in bytes -- confirm with callers).
// out:   1024 coefficients, stored row-major with a row length of 32.
//
// dct32_1d is applied to each column, then to each row of the intermediate
// result; every value is then divided by 4 and rounded to the nearest
// integer before being stored.
void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
  vp9_clear_system_state();  // Make it simd safe : __asm emms;
  {
    const int row_step = pitch >> 1;
    double coeffs[1024];
    double line_in[32], line_out[32];
    int r, c, idx;

    // Pass 1: transform each column of the input block.
    for (c = 0; c < 32; c++) {
      for (r = 0; r < 32; r++)
        line_in[r] = input[r * row_step + c];
      dct32_1d(line_in, line_out, 1);
      for (r = 0; r < 32; r++)
        coeffs[r * 32 + c] = line_out[r];
    }

    // Pass 2: transform each row of the column-transformed data in place.
    for (r = 0; r < 32; ++r) {
      double *const row = coeffs + r * 32;
      for (c = 0; c < 32; ++c)
        line_in[c] = row[c];
      dct32_1d(line_in, line_out, 1);
      for (c = 0; c < 32; ++c)
        row[c] = line_out[c];
    }

    // Final scaling: divide by 4 and round to the nearest integer.
    for (idx = 0; idx < 1024; idx++) {
      out[idx] = (short)round(coeffs[idx] / 4);
    }
  }

  vp9_clear_system_state();  // Make it simd safe : __asm emms;
}
| 1686 |
| 1687 #else // CONFIG_DWTDCTHYBRID |
| 1688 |
| 1689 #if DWT_TYPE == 53 |
| 1690 |
| 1691 // Note: block length must be even for this implementation |
// Row analysis step of the DWT_TYPE == 53 wavelet.
//
// length:   number of samples in x; must be even.  Lengths below 2 are
//           rejected and leave lowpass/highpass untouched.
// x:        interleaved input samples (even index, odd index, ...).
// lowpass:  receives length / 2 values.
// highpass: receives length / 2 values.
//
// Pass 1 stores twice each even sample in lowpass and, in highpass, each odd
// sample minus the rounded average of its two even neighbours.  The last odd
// sample has no right-hand even neighbour, so the left one is subtracted on
// its own.  Pass 2 adds to each lowpass value the rounded half-sum of the
// highpass value at the same index and the one before it; index 0 uses
// highpass[0] for both.
static void analysis_53_row(int length, short *x,
                            short *lowpass, short *highpass) {
  const int half = length >> 1;
  int i;
  short even, prev;

  // The original countdown loop (while (--n)) never terminated sensibly when
  // length < 2; bail out instead of running off the buffers.
  if (half < 1) return;

  // Pass 1.  Multiplication replaces "<< 1" because left-shifting a negative
  // value is undefined behaviour in C.
  for (i = 0; i < half - 1; i++) {
    even = x[2 * i];
    lowpass[i] = even * 2;
    highpass[i] = x[2 * i + 1] - ((even + x[2 * i + 2] + 1) >> 1);
  }
  even = x[2 * half - 2];
  lowpass[half - 1] = even * 2;
  highpass[half - 1] = x[2 * half - 1] - even;

  // Pass 2.
  prev = highpass[0];
  for (i = 0; i < half; i++) {
    lowpass[i] += (prev + highpass[i] + 1) >> 1;
    prev = highpass[i];
  }
}
| 1717 |
// Column analysis step of the DWT_TYPE == 53 wavelet.
//
// length:   number of samples in x; must be even.  Lengths below 2 are
//           rejected and leave lowpass/highpass untouched.
// x:        interleaved input samples (even index, odd index, ...).
// lowpass:  receives length / 2 values.
// highpass: receives length / 2 values.
//
// Pass 1 copies each even sample to lowpass and stores in highpass
// (2 * odd - (left even + right even) + 2) >> 2.  The last odd sample has no
// right-hand even neighbour and uses (odd - left even + 1) >> 1 instead.
// Pass 2 adds to each lowpass value the rounded half-sum of the highpass
// value at the same index and the one before it; index 0 uses highpass[0]
// for both.
static void analysis_53_col(int length, short *x,
                            short *lowpass, short *highpass) {
  const int half = length >> 1;
  int i;
  short even, prev;

  // The original countdown loop (while (--n)) never terminated sensibly when
  // length < 2; bail out instead of running off the buffers.
  if (half < 1) return;

  // Pass 1.  Multiplication replaces "<< 1" because left-shifting a negative
  // value is undefined behaviour in C.
  for (i = 0; i < half - 1; i++) {
    even = x[2 * i];
    lowpass[i] = even;
    highpass[i] = ((x[2 * i + 1] * 2) - (even + x[2 * i + 2]) + 2) >> 2;
  }
  even = x[2 * half - 2];
  lowpass[half - 1] = even;
  highpass[half - 1] = (x[2 * half - 1] - even + 1) >> 1;

  // Pass 2.
  prev = highpass[0];
  for (i = 0; i < half; i++) {
    lowpass[i] += (prev + highpass[i] + 1) >> 1;
    prev = highpass[i];
  }
}
| 1743 |
| 1744 static void dyadic_analyze_53(int levels, int width, int height, |
| 1745 short *x, int pitch_x, short *c, int pitch_c) { |
| 1746 int lv, i, j, nh, nw, hh = height, hw = width; |
| 1747 short buffer[2 * DWT_MAX_LENGTH]; |
| 1748 for (i = 0; i < height; i++) { |
| 1749 for (j = 0; j < width; j++) { |
| 1750 c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS; |
| 1751 } |
| 1752 } |
| 1753 for (lv = 0; lv < levels; lv++) { |
| 1754 nh = hh; |
| 1755 hh = (hh + 1) >> 1; |
| 1756 nw = hw; |
| 1757 hw = (hw + 1) >> 1; |
| 1758 if ((nh < 2) || (nw < 2)) return; |
| 1759 for (i = 0; i < nh; i++) { |
| 1760 memcpy(buffer, &c[i * pitch_c], nw * sizeof(short)); |
| 1761 analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw); |
| 1762 } |
| 1763 for (j = 0; j < nw; j++) { |
| 1764 for (i = 0; i < nh; i++) |
| 1765 buffer[i + nh] = c[i * pitch_c + j]; |
| 1766 analysis_53_col(nh, buffer + nh, buffer, buffer + hh); |
| 1767 for (i = 0; i < nh; i++) |
| 1768 c[i * pitch_c + j] = buffer[i]; |
| 1769 } |
| 1770 } |
| 1771 } |
| 1772 |
| 1773 #elif DWT_TYPE == 26 |
| 1774 |
// Row analysis step of the DWT_TYPE == 26 wavelet.
//
// length:   number of samples in x (consumed in pairs).
// x:        input samples.
// lowpass:  receives length / 2 pair sums.
// highpass: receives length / 2 pair differences (first minus second).
//
// When at least four pairs are present, every highpass value is further
// reduced by ((left lowpass - right lowpass + 4) >> 3).  At index 0 the
// "left" value is lowpass[0]; at the last index the "right" value is the
// last lowpass value itself.
static void analysis_26_row(int length, short *x,
                            short *lowpass, short *highpass) {
  const int pairs = length >> 1;
  int k;
  short first, second, prev;

  for (k = 0; k < pairs; k++) {
    first = x[2 * k];
    second = x[2 * k + 1];
    lowpass[k] = first + second;
    highpass[k] = first - second;
  }

  if (pairs < 4) return;

  prev = lowpass[0];
  for (k = 0; k < pairs - 1; k++) {
    highpass[k] -= (prev - lowpass[k + 1] + 4) >> 3;
    prev = lowpass[k];
  }
  highpass[pairs - 1] -= (prev - lowpass[pairs - 1] + 4) >> 3;
}
| 1799 |
// Column analysis step of the DWT_TYPE == 26 wavelet.
//
// length:   number of samples in x (consumed in pairs).
// x:        input samples.
// lowpass:  receives length / 2 values, each (first + second + 1) >> 1.
// highpass: receives length / 2 values, each (first - second + 1) >> 1.
//
// When at least four pairs are present, every highpass value is further
// reduced by ((left lowpass - right lowpass + 4) >> 3).  At index 0 the
// "left" value is lowpass[0]; at the last index the "right" value is the
// last lowpass value itself.
static void analysis_26_col(int length, short *x,
                            short *lowpass, short *highpass) {
  const int pairs = length >> 1;
  int k;
  short first, second, prev;

  for (k = 0; k < pairs; k++) {
    first = x[2 * k];
    second = x[2 * k + 1];
    lowpass[k] = (first + second + 1) >> 1;
    highpass[k] = (first - second + 1) >> 1;
  }

  if (pairs < 4) return;

  prev = lowpass[0];
  for (k = 0; k < pairs - 1; k++) {
    highpass[k] -= (prev - lowpass[k + 1] + 4) >> 3;
    prev = lowpass[k];
  }
  highpass[pairs - 1] -= (prev - lowpass[pairs - 1] + 4) >> 3;
}
| 1824 |
| 1825 static void dyadic_analyze_26(int levels, int width, int height, |
| 1826 short *x, int pitch_x, short *c, int pitch_c) { |
| 1827 int lv, i, j, nh, nw, hh = height, hw = width; |
| 1828 short buffer[2 * DWT_MAX_LENGTH]; |
| 1829 for (i = 0; i < height; i++) { |
| 1830 for (j = 0; j < width; j++) { |
| 1831 c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS; |
| 1832 } |
| 1833 } |
| 1834 for (lv = 0; lv < levels; lv++) { |
| 1835 nh = hh; |
| 1836 hh = (hh + 1) >> 1; |
| 1837 nw = hw; |
| 1838 hw = (hw + 1) >> 1; |
| 1839 if ((nh < 2) || (nw < 2)) return; |
| 1840 for (i = 0; i < nh; i++) { |
| 1841 memcpy(buffer, &c[i * pitch_c], nw * sizeof(short)); |
| 1842 analysis_26_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw); |
| 1843 } |
| 1844 for (j = 0; j < nw; j++) { |
| 1845 for (i = 0; i < nh; i++) |
| 1846 buffer[i + nh] = c[i * pitch_c + j]; |
| 1847 analysis_26_col(nh, buffer + nh, buffer, buffer + hh); |
| 1848 for (i = 0; i < nh; i++) |
| 1849 c[i * pitch_c + j] = buffer[i]; |
| 1850 } |
| 1851 } |
| 1852 } |
| 1853 |
| 1854 #elif DWT_TYPE == 97 |
| 1855 |
| 1856 static void analysis_97(int length, double *x, |
| 1857 double *lowpass, double *highpass) { |
| 1858 static const double a_predict1 = -1.586134342; |
| 1859 static const double a_update1 = -0.05298011854; |
| 1860 static const double a_predict2 = 0.8829110762; |
| 1861 static const double a_update2 = 0.4435068522; |
| 1862 static const double s_low = 1.149604398; |
| 1863 static const double s_high = 1/1.149604398; |
| 1864 int i; |
| 1865 double y[DWT_MAX_LENGTH]; |
| 1866 // Predict 1 |
| 1867 for (i = 1; i < length - 2; i += 2) { |
| 1868 x[i] += a_predict1 * (x[i - 1] + x[i + 1]); |
| 1869 } |
| 1870 x[length - 1] += 2 * a_predict1 * x[length - 2]; |
| 1871 // Update 1 |
| 1872 for (i = 2; i < length; i += 2) { |
| 1873 x[i] += a_update1 * (x[i - 1] + x[i + 1]); |
| 1874 } |
| 1875 x[0] += 2 * a_update1 * x[1]; |
| 1876 // Predict 2 |
| 1877 for (i = 1; i < length - 2; i += 2) { |
| 1878 x[i] += a_predict2 * (x[i - 1] + x[i + 1]); |
| 1879 } |
| 1880 x[length - 1] += 2 * a_predict2 * x[length - 2]; |
| 1881 // Update 2 |
| 1882 for (i = 2; i < length; i += 2) { |
| 1883 x[i] += a_update2 * (x[i - 1] + x[i + 1]); |
| 1884 } |
| 1885 x[0] += 2 * a_update2 * x[1]; |
| 1886 memcpy(y, x, sizeof(*y) * length); |
| 1887 // Scale and pack |
| 1888 for (i = 0; i < length / 2; i++) { |
| 1889 lowpass[i] = y[2 * i] * s_low; |
| 1890 highpass[i] = y[2 * i + 1] * s_high; |
| 1891 } |
| 1892 } |
| 1893 |
| 1894 static void dyadic_analyze_97(int levels, int width, int height, |
| 1895 short *x, int pitch_x, short *c, int pitch_c) { |
| 1896 int lv, i, j, nh, nw, hh = height, hw = width; |
| 1897 double buffer[2 * DWT_MAX_LENGTH]; |
| 1898 double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH]; |
| 1899 for (i = 0; i < height; i++) { |
| 1900 for (j = 0; j < width; j++) { |
| 1901 y[i * DWT_MAX_LENGTH + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS; |
| 1902 } |
| 1903 } |
| 1904 for (lv = 0; lv < levels; lv++) { |
| 1905 nh = hh; |
| 1906 hh = (hh + 1) >> 1; |
| 1907 nw = hw; |
| 1908 hw = (hw + 1) >> 1; |
| 1909 if ((nh < 2) || (nw < 2)) return; |
| 1910 for (i = 0; i < nh; i++) { |
| 1911 memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer)); |
| 1912 analysis_97(nw, buffer, &y[i * DWT_MAX_LENGTH], |
| 1913 &y[i * DWT_MAX_LENGTH] + hw); |
| 1914 } |
| 1915 for (j = 0; j < nw; j++) { |
| 1916 for (i = 0; i < nh; i++) |
| 1917 buffer[i + nh] = y[i * DWT_MAX_LENGTH + j]; |
| 1918 analysis_97(nh, buffer + nh, buffer, buffer + hh); |
| 1919 for (i = 0; i < nh; i++) |
| 1920 c[i * pitch_c + j] = round(buffer[i]); |
| 1921 } |
| 1922 } |
| 1923 } |
| 1924 |
| 1925 #endif // DWT_TYPE |
| 1926 |
| 1927 // TODO(debargha): Implement the scaling differently so as not to have to |
| 1928 // use the floating point dct |
/*
 * One-dimensional 16-point DCT kernel in double precision, used by
 * vp9_short_fdct16x16_c_f() for both its column pass and its row pass.
 *
 * input  - 16 samples. Every element is read in step 1, before output[] is
 *          written for the first time; input[] is not read again after that.
 * output - receives the 16 results. It is also used as scratch storage
 *          between steps, so intermediate contents are overwritten.
 *
 * No final normalization is applied here; the 2-D caller divides the result
 * by (2 << scale) after both passes.
 */
static void dct16x16_1d_f(double input[16], double output[16]) {
  // Ck == cos(k * pi / 32) for k = 1..15.
  static const double C1 = 0.995184726672197;
  static const double C2 = 0.98078528040323;
  static const double C3 = 0.956940335732209;
  static const double C4 = 0.923879532511287;
  static const double C5 = 0.881921264348355;
  static const double C6 = 0.831469612302545;
  static const double C7 = 0.773010453362737;
  static const double C8 = 0.707106781186548;
  static const double C9 = 0.634393284163646;
  static const double C10 = 0.555570233019602;
  static const double C11 = 0.471396736825998;
  static const double C12 = 0.38268343236509;
  static const double C13 = 0.290284677254462;
  static const double C14 = 0.195090322016128;
  static const double C15 = 0.098017140329561;

  vp9_clear_system_state();  // Make it simd safe : __asm emms;
  {
    double step[16];
    double intermediate[16];  // only elements 8, 9 and 11..15 are used
    double temp1, temp2;

    // step 1
    // Fold the input about its midpoint: sums of mirrored samples go to
    // step[0..7], differences go to step[8..15].
    step[ 0] = input[0] + input[15];
    step[ 1] = input[1] + input[14];
    step[ 2] = input[2] + input[13];
    step[ 3] = input[3] + input[12];
    step[ 4] = input[4] + input[11];
    step[ 5] = input[5] + input[10];
    step[ 6] = input[6] + input[ 9];
    step[ 7] = input[7] + input[ 8];
    step[ 8] = input[7] - input[ 8];
    step[ 9] = input[6] - input[ 9];
    step[10] = input[5] - input[10];
    step[11] = input[4] - input[11];
    step[12] = input[3] - input[12];
    step[13] = input[2] - input[13];
    step[14] = input[1] - input[14];
    step[15] = input[0] - input[15];

    // step 2
    // Sum half: fold step[0..7] about its midpoint once more.
    output[0] = step[0] + step[7];
    output[1] = step[1] + step[6];
    output[2] = step[2] + step[5];
    output[3] = step[3] + step[4];
    output[4] = step[3] - step[4];
    output[5] = step[2] - step[5];
    output[6] = step[1] - step[6];
    output[7] = step[0] - step[7];

    // Difference half: combine the mirrored pairs (8,15), (9,14), (10,13)
    // and (11,12) with odd-indexed cosine weights.
    temp1 = step[ 8]*C7;
    temp2 = step[15]*C9;
    output[ 8] = temp1 + temp2;

    temp1 = step[ 9]*C11;
    temp2 = step[14]*C5;
    output[ 9] = temp1 - temp2;

    temp1 = step[10]*C3;
    temp2 = step[13]*C13;
    output[10] = temp1 + temp2;

    temp1 = step[11]*C15;
    temp2 = step[12]*C1;
    output[11] = temp1 - temp2;

    temp1 = step[11]*C1;
    temp2 = step[12]*C15;
    output[12] = temp2 + temp1;

    temp1 = step[10]*C13;
    temp2 = step[13]*C3;
    output[13] = temp2 - temp1;

    temp1 = step[ 9]*C5;
    temp2 = step[14]*C11;
    output[14] = temp2 + temp1;

    temp1 = step[ 8]*C9;
    temp2 = step[15]*C7;
    output[15] = temp2 - temp1;

    // step 3
    // step[] is reused from here on; its step-1 contents are no longer needed.
    step[ 0] = output[0] + output[3];
    step[ 1] = output[1] + output[2];
    step[ 2] = output[1] - output[2];
    step[ 3] = output[0] - output[3];

    temp1 = output[4]*C14;
    temp2 = output[7]*C2;
    step[ 4] = temp1 + temp2;

    temp1 = output[5]*C10;
    temp2 = output[6]*C6;
    step[ 5] = temp1 + temp2;

    temp1 = output[5]*C6;
    temp2 = output[6]*C10;
    step[ 6] = temp2 - temp1;

    temp1 = output[4]*C2;
    temp2 = output[7]*C14;
    step[ 7] = temp2 - temp1;

    step[ 8] = output[ 8] + output[11];
    step[ 9] = output[ 9] + output[10];
    step[10] = output[ 9] - output[10];
    step[11] = output[ 8] - output[11];

    step[12] = output[12] + output[15];
    step[13] = output[13] + output[14];
    step[14] = output[13] - output[14];
    step[15] = output[12] - output[15];

    // step 4
    // Final combinations. Each output[] slot written below is a finished
    // result; note that they are produced out of index order.
    output[ 0] = (step[ 0] + step[ 1]);
    output[ 8] = (step[ 0] - step[ 1]);

    temp1 = step[2]*C12;
    temp2 = step[3]*C4;
    temp1 = temp1 + temp2;
    output[ 4] = 2*(temp1*C8);

    temp1 = step[2]*C4;
    temp2 = step[3]*C12;
    temp1 = temp2 - temp1;
    output[12] = 2*(temp1*C8);

    output[ 2] = 2*((step[4] + step[ 5])*C8);
    output[14] = 2*((step[7] - step[ 6])*C8);

    temp1 = step[4] - step[5];
    temp2 = step[6] + step[7];
    output[ 6] = (temp1 + temp2);
    output[10] = (temp1 - temp2);

    intermediate[8] = step[8] + step[14];
    intermediate[9] = step[9] + step[15];

    temp1 = intermediate[8]*C12;
    temp2 = intermediate[9]*C4;
    temp1 = temp1 - temp2;
    output[3] = 2*(temp1*C8);

    temp1 = intermediate[8]*C4;
    temp2 = intermediate[9]*C12;
    temp1 = temp2 + temp1;
    output[13] = 2*(temp1*C8);

    output[ 9] = 2*((step[10] + step[11])*C8);

    intermediate[11] = step[10] - step[11];
    intermediate[12] = step[12] + step[13];
    intermediate[13] = step[12] - step[13];
    intermediate[14] = step[ 8] - step[14];
    intermediate[15] = step[ 9] - step[15];

    output[15] = (intermediate[11] + intermediate[12]);
    output[ 1] = -(intermediate[11] - intermediate[12]);

    output[ 7] = 2*(intermediate[13]*C8);

    temp1 = intermediate[14]*C12;
    temp2 = intermediate[15]*C4;
    temp1 = temp1 - temp2;
    output[11] = -2*(temp1*C8);

    temp1 = intermediate[14]*C4;
    temp2 = intermediate[15]*C12;
    temp1 = temp2 + temp1;
    output[ 5] = 2*(temp1*C8);
  }
  vp9_clear_system_state();  // Make it simd safe : __asm emms;
}
| 2104 |
/*
 * 16x16 forward DCT evaluated in double precision and rounded to shorts.
 *
 * input - top-left sample of the 16x16 source block.
 * out   - receives 256 coefficients, stored row by row with a stride of 16.
 * pitch - source row pitch; it is halved before being used to index shorts
 *         (presumably it is expressed in bytes - confirm against callers).
 * scale - every coefficient is divided by (2 << scale) before rounding.
 */
static void vp9_short_fdct16x16_c_f(short *input, short *out, int pitch,
                                    int scale) {
  const int stride = pitch >> 1;
  const int divisor = 2 << scale;
  double coeffs[16 * 16];
  double line_in[16], line_out[16];
  int r, c, n;

  vp9_clear_system_state();  // Make it simd safe : __asm emms;

  // Pass 1: run the 1-D transform down each column of the source block.
  for (c = 0; c < 16; c++) {
    for (r = 0; r < 16; r++)
      line_in[r] = input[r * stride + c];
    dct16x16_1d_f(line_in, line_out);
    for (r = 0; r < 16; r++)
      coeffs[r * 16 + c] = line_out[r];
  }

  // Pass 2: run it along each row of the column-transformed data.
  for (r = 0; r < 16; ++r) {
    double *const row = coeffs + r * 16;
    for (c = 0; c < 16; ++c)
      line_in[c] = row[c];
    dct16x16_1d_f(line_in, line_out);
    for (c = 0; c < 16; ++c)
      row[c] = line_out[c];
  }

  // Apply the caller-selected scaling and round to the nearest integer.
  for (n = 0; n < 16 * 16; n++)
    out[n] = (short)round(coeffs[n] / divisor);

  vp9_clear_system_state();  // Make it simd safe : __asm emms;
}
| 2136 |
| 2137 void vp9_short_fdct8x8_c_f(short *block, short *coefs, int pitch, int scale) { |
| 2138 int j1, i, j, k; |
| 2139 float b[8]; |
| 2140 float b1[8]; |
| 2141 float d[8][8]; |
| 2142 float f0 = (float) .7071068; |
| 2143 float f1 = (float) .4903926; |
| 2144 float f2 = (float) .4619398; |
| 2145 float f3 = (float) .4157348; |
| 2146 float f4 = (float) .3535534; |
| 2147 float f5 = (float) .2777851; |
| 2148 float f6 = (float) .1913417; |
| 2149 float f7 = (float) .0975452; |
| 2150 pitch = pitch / 2; |
| 2151 for (i = 0, k = 0; i < 8; i++, k += pitch) { |
| 2152 for (j = 0; j < 8; j++) { |
| 2153 b[j] = (float)(block[k + j] << (3 - scale)); |
| 2154 } |
| 2155 /* Horizontal transform */ |
| 2156 for (j = 0; j < 4; j++) { |
| 2157 j1 = 7 - j; |
| 2158 b1[j] = b[j] + b[j1]; |
| 2159 b1[j1] = b[j] - b[j1]; |
| 2160 } |
| 2161 b[0] = b1[0] + b1[3]; |
| 2162 b[1] = b1[1] + b1[2]; |
| 2163 b[2] = b1[1] - b1[2]; |
| 2164 b[3] = b1[0] - b1[3]; |
| 2165 b[4] = b1[4]; |
| 2166 b[5] = (b1[6] - b1[5]) * f0; |
| 2167 b[6] = (b1[6] + b1[5]) * f0; |
| 2168 b[7] = b1[7]; |
| 2169 d[i][0] = (b[0] + b[1]) * f4; |
| 2170 d[i][4] = (b[0] - b[1]) * f4; |
| 2171 d[i][2] = b[2] * f6 + b[3] * f2; |
| 2172 d[i][6] = b[3] * f6 - b[2] * f2; |
| 2173 b1[4] = b[4] + b[5]; |
| 2174 b1[7] = b[7] + b[6]; |
| 2175 b1[5] = b[4] - b[5]; |
| 2176 b1[6] = b[7] - b[6]; |
| 2177 d[i][1] = b1[4] * f7 + b1[7] * f1; |
| 2178 d[i][5] = b1[5] * f3 + b1[6] * f5; |
| 2179 d[i][7] = b1[7] * f7 - b1[4] * f1; |
| 2180 d[i][3] = b1[6] * f3 - b1[5] * f5; |
| 2181 } |
| 2182 /* Vertical transform */ |
| 2183 for (i = 0; i < 8; i++) { |
| 2184 for (j = 0; j < 4; j++) { |
| 2185 j1 = 7 - j; |
| 2186 b1[j] = d[j][i] + d[j1][i]; |
| 2187 b1[j1] = d[j][i] - d[j1][i]; |
| 2188 } |
| 2189 b[0] = b1[0] + b1[3]; |
| 2190 b[1] = b1[1] + b1[2]; |
| 2191 b[2] = b1[1] - b1[2]; |
| 2192 b[3] = b1[0] - b1[3]; |
| 2193 b[4] = b1[4]; |
| 2194 b[5] = (b1[6] - b1[5]) * f0; |
| 2195 b[6] = (b1[6] + b1[5]) * f0; |
| 2196 b[7] = b1[7]; |
| 2197 d[0][i] = (b[0] + b[1]) * f4; |
| 2198 d[4][i] = (b[0] - b[1]) * f4; |
| 2199 d[2][i] = b[2] * f6 + b[3] * f2; |
| 2200 d[6][i] = b[3] * f6 - b[2] * f2; |
| 2201 b1[4] = b[4] + b[5]; |
| 2202 b1[7] = b[7] + b[6]; |
| 2203 b1[5] = b[4] - b[5]; |
| 2204 b1[6] = b[7] - b[6]; |
| 2205 d[1][i] = b1[4] * f7 + b1[7] * f1; |
| 2206 d[5][i] = b1[5] * f3 + b1[6] * f5; |
| 2207 d[7][i] = b1[7] * f7 - b1[4] * f1; |
| 2208 d[3][i] = b1[6] * f3 - b1[5] * f5; |
| 2209 } |
| 2210 for (i = 0; i < 8; i++) { |
| 2211 for (j = 0; j < 8; j++) { |
| 2212 *(coefs + j + i * 8) = (short) floor(d[i][j] + 0.5); |
| 2213 } |
| 2214 } |
| 2215 return; |
| 2216 } |
| 2217 |
// Scales d down by 2^n when n >= 0, or up by 2^(-n) when n is negative.
// The negative case multiplies by (1 << -(n)) instead of shifting by n:
// a negative shift count is undefined behaviour in C, and so is
// left-shifting a negative d.
// NOTE: both arguments are expanded more than once, so pass expressions
// without side effects.
#define divide_bits(d, n) \
  ((n) < 0 ? (d) * (1 << -(n)) : (d) >> (n))
| 2219 |
| 2220 #if DWTDCT_TYPE == DWTDCT16X16_LEAN |
| 2221 |
| 2222 void vp9_short_fdct32x32_c(short *input, short *out, int pitch) { |
| 2223 // assume out is a 32x32 buffer |
| 2224 short buffer[16 * 16]; |
| 2225 int i, j; |
| 2226 const int short_pitch = pitch >> 1; |
| 2227 #if DWT_TYPE == 26 |
| 2228 dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32); |
| 2229 #elif DWT_TYPE == 97 |
| 2230 dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32); |
| 2231 #elif DWT_TYPE == 53 |
| 2232 dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32); |
| 2233 #endif |
| 2234 // TODO(debargha): Implement more efficiently by adding output pitch |
| 2235 // argument to the dct16x16 function |
| 2236 vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS); |
| 2237 for (i = 0; i < 16; ++i) |
| 2238 vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16); |
| 2239 for (i = 0; i < 16; ++i) { |
| 2240 for (j = 16; j < 32; ++j) { |
| 2241 out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2); |
| 2242 } |
| 2243 } |
| 2244 for (i = 16; i < 32; ++i) { |
| 2245 for (j = 0; j < 32; ++j) { |
| 2246 out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2); |
| 2247 } |
| 2248 } |
| 2249 } |
| 2250 |
| 2251 #elif DWTDCT_TYPE == DWTDCT16X16 |
| 2252 |
| 2253 void vp9_short_fdct32x32_c(short *input, short *out, int pitch) { |
| 2254 // assume out is a 32x32 buffer |
| 2255 short buffer[16 * 16]; |
| 2256 int i, j; |
| 2257 const int short_pitch = pitch >> 1; |
| 2258 #if DWT_TYPE == 26 |
| 2259 dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32); |
| 2260 #elif DWT_TYPE == 97 |
| 2261 dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32); |
| 2262 #elif DWT_TYPE == 53 |
| 2263 dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32); |
| 2264 #endif |
| 2265 // TODO(debargha): Implement more efficiently by adding output pitch |
| 2266 // argument to the dct16x16 function |
| 2267 vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS); |
| 2268 for (i = 0; i < 16; ++i) |
| 2269 vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16); |
| 2270 vp9_short_fdct16x16_c_f(out + 16, buffer, 64, 1 + DWT_PRECISION_BITS); |
| 2271 for (i = 0; i < 16; ++i) |
| 2272 vpx_memcpy(out + i * 32 + 16, buffer + i * 16, sizeof(short) * 16); |
| 2273 |
| 2274 vp9_short_fdct16x16_c_f(out + 32 * 16, buffer, 64, 1 + DWT_PRECISION_BITS); |
| 2275 for (i = 0; i < 16; ++i) |
| 2276 vpx_memcpy(out + i * 32 + 32 * 16, buffer + i * 16, sizeof(short) * 16); |
| 2277 |
| 2278 vp9_short_fdct16x16_c_f(out + 33 * 16, buffer, 64, 1 + DWT_PRECISION_BITS); |
| 2279 for (i = 0; i < 16; ++i) |
| 2280 vpx_memcpy(out + i * 32 + 33 * 16, buffer + i * 16, sizeof(short) * 16); |
| 2281 } |
| 2282 |
| 2283 #elif DWTDCT_TYPE == DWTDCT8X8 |
| 2284 |
| 2285 void vp9_short_fdct32x32_c(short *input, short *out, int pitch) { |
| 2286 // assume out is a 32x32 buffer |
| 2287 short buffer[8 * 8]; |
| 2288 int i, j; |
| 2289 const int short_pitch = pitch >> 1; |
| 2290 #if DWT_TYPE == 26 |
| 2291 dyadic_analyze_26(2, 32, 32, input, short_pitch, out, 32); |
| 2292 #elif DWT_TYPE == 97 |
| 2293 dyadic_analyze_97(2, 32, 32, input, short_pitch, out, 32); |
| 2294 #elif DWT_TYPE == 53 |
| 2295 dyadic_analyze_53(2, 32, 32, input, short_pitch, out, 32); |
| 2296 #endif |
| 2297 // TODO(debargha): Implement more efficiently by adding output pitch |
| 2298 // argument to the dct16x16 function |
| 2299 vp9_short_fdct8x8_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS); |
| 2300 for (i = 0; i < 8; ++i) |
| 2301 vpx_memcpy(out + i * 32, buffer + i * 8, sizeof(short) * 8); |
| 2302 |
| 2303 vp9_short_fdct8x8_c_f(out + 8, buffer, 64, 1 + DWT_PRECISION_BITS); |
| 2304 for (i = 0; i < 8; ++i) |
| 2305 vpx_memcpy(out + i * 32 + 8, buffer + i * 8, sizeof(short) * 8); |
| 2306 |
| 2307 vp9_short_fdct8x8_c_f(out + 32 * 8, buffer, 64, 1 + DWT_PRECISION_BITS); |
| 2308 for (i = 0; i < 8; ++i) |
| 2309 vpx_memcpy(out + i * 32 + 32 * 8, buffer + i * 8, sizeof(short) * 8); |
| 2310 |
| 2311 vp9_short_fdct8x8_c_f(out + 33 * 8, buffer, 64, 1 + DWT_PRECISION_BITS); |
| 2312 for (i = 0; i < 8; ++i) |
| 2313 vpx_memcpy(out + i * 32 + 33 * 8, buffer + i * 8, sizeof(short) * 8); |
| 2314 |
| 2315 for (i = 0; i < 16; ++i) { |
| 2316 for (j = 16; j < 32; ++j) { |
| 2317 out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2); |
| 2318 } |
| 2319 } |
| 2320 for (i = 16; i < 32; ++i) { |
| 2321 for (j = 0; j < 32; ++j) { |
| 2322 out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2); |
| 2323 } |
| 2324 } |
| 2325 } |
| 2326 |
| 2327 #endif |
| 2328 |
| 2329 #if CONFIG_TX64X64 |
| 2330 void vp9_short_fdct64x64_c(short *input, short *out, int pitch) { |
| 2331 // assume out is a 64x64 buffer |
| 2332 short buffer[16 * 16]; |
| 2333 int i, j; |
| 2334 const int short_pitch = pitch >> 1; |
| 2335 #if DWT_TYPE == 26 |
| 2336 dyadic_analyze_26(2, 64, 64, input, short_pitch, out, 64); |
| 2337 #elif DWT_TYPE == 97 |
| 2338 dyadic_analyze_97(2, 64, 64, input, short_pitch, out, 64); |
| 2339 #elif DWT_TYPE == 53 |
| 2340 dyadic_analyze_53(2, 64, 64, input, short_pitch, out, 64); |
| 2341 #endif |
| 2342 // TODO(debargha): Implement more efficiently by adding output pitch |
| 2343 // argument to the dct16x16 function |
| 2344 vp9_short_fdct16x16_c_f(out, buffer, 128, 2 + DWT_PRECISION_BITS); |
| 2345 for (i = 0; i < 16; ++i) |
| 2346 vpx_memcpy(out + i * 64, buffer + i * 16, sizeof(short) * 16); |
| 2347 |
| 2348 #if DWTDCT_TYPE == DWTDCT16X16_LEAN |
| 2349 for (i = 0; i < 16; ++i) { |
| 2350 for (j = 16; j < 48; ++j) { |
| 2351 out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1); |
| 2352 } |
| 2353 } |
| 2354 for (i = 16; i < 64; ++i) { |
| 2355 for (j = 0; j < 64; ++j) { |
| 2356 out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1); |
| 2357 } |
| 2358 } |
| 2359 #elif DWTDCT_TYPE == DWTDCT16X16 |
| 2360 vp9_short_fdct16x16_c_f(out + 16, buffer, 128, 2 + DWT_PRECISION_BITS); |
| 2361 for (i = 0; i < 16; ++i) |
| 2362 vpx_memcpy(out + i * 64 + 16, buffer + i * 16, sizeof(short) * 16); |
| 2363 |
| 2364 vp9_short_fdct16x16_c_f(out + 64 * 16, buffer, 128, 2 + DWT_PRECISION_BITS); |
| 2365 for (i = 0; i < 16; ++i) |
| 2366 vpx_memcpy(out + i * 64 + 64 * 16, buffer + i * 16, sizeof(short) * 16); |
| 2367 |
| 2368 vp9_short_fdct16x16_c_f(out + 65 * 16, buffer, 128, 2 + DWT_PRECISION_BITS); |
| 2369 for (i = 0; i < 16; ++i) |
| 2370 vpx_memcpy(out + i * 64 + 65 * 16, buffer + i * 16, sizeof(short) * 16); |
| 2371 |
| 2372 // There is no dct used on the highest bands for now. |
| 2373 // Need to scale these coeffs by a factor of 2/2^DWT_PRECISION_BITS |
| 2374 // TODO(debargha): experiment with turning these coeffs to 0 |
| 2375 for (i = 0; i < 32; ++i) { |
| 2376 for (j = 32; j < 64; ++j) { |
| 2377 out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1); |
| 2378 } |
| 2379 } |
| 2380 for (i = 32; i < 64; ++i) { |
| 2381 for (j = 0; j < 64; ++j) { |
| 2382 out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1); |
| 2383 } |
| 2384 } |
| 2385 #endif // DWTDCT_TYPE |
| 2386 } |
| 2387 #endif // CONFIG_TX64X64 |
| 2388 #endif // CONFIG_DWTDCTHYBRID |
OLD | NEW |