OLD | NEW |
1 /* | 1 /* |
2 * ARMv7 NEON optimizations for libjpeg-turbo | 2 * ARMv7 NEON optimizations for libjpeg-turbo |
3 * | 3 * |
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). | 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). |
5 * All rights reserved. | 5 * All rights reserved. |
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> | 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> |
| 7 * Copyright (C) 2014 Linaro Limited. All Rights Reserved. |
7 * | 8 * |
8 * This software is provided 'as-is', without any express or implied | 9 * This software is provided 'as-is', without any express or implied |
9 * warranty. In no event will the authors be held liable for any damages | 10 * warranty. In no event will the authors be held liable for any damages |
10 * arising from the use of this software. | 11 * arising from the use of this software. |
11 * | 12 * |
12 * Permission is granted to anyone to use this software for any purpose, | 13 * Permission is granted to anyone to use this software for any purpose, |
13 * including commercial applications, and to alter it and redistribute it | 14 * including commercial applications, and to alter it and redistribute it |
14 * freely, subject to the following restrictions: | 15 * freely, subject to the following restrictions: |
15 * | 16 * |
16 * 1. The origin of this software must not be misrepresented; you must not | 17 * 1. The origin of this software must not be misrepresented; you must not |
(...skipping 1322 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1339 vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! | 1340 vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]! |
1340 vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! | 1341 vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]! |
1341 .elseif \size == 2 | 1342 .elseif \size == 2 |
1342 vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! | 1343 vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]! |
1343 vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! | 1344 vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]! |
1344 .elseif \size == 1 | 1345 .elseif \size == 1 |
1345 vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! | 1346 vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]! |
1346 .else | 1347 .else |
1347 .error unsupported macroblock size | 1348 .error unsupported macroblock size |
1348 .endif | 1349 .endif |
| 1350 .elseif \bpp == 16 |
| 1351 .if \size == 8 |
| 1352 vst1.16 {q15}, [RGB]! |
| 1353 .elseif \size == 4 |
| 1354 vst1.16 {d30}, [RGB]! |
| 1355 .elseif \size == 2 |
| 1356 vst1.16 {d31[0]}, [RGB]! |
| 1357 vst1.16 {d31[1]}, [RGB]! |
| 1358 .elseif \size == 1 |
| 1359 vst1.16 {d31[2]}, [RGB]! |
| 1360 .else |
| 1361 .error unsupported macroblock size |
| 1362 .endif |
1349 .else | 1363 .else |
1350 .error unsupported bpp | 1364 .error unsupported bpp |
1351 .endif | 1365 .endif |
1352 .endm | 1366 .endm |
1353 | 1367 |
1354 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs | 1368 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs |
1355 | 1369 |
1356 /* | 1370 /* |
1357 * 2 stage pipelined YCbCr->RGB conversion | 1371 * 2 stage pipelined YCbCr->RGB conversion |
1358 */ | 1372 */ |
(...skipping 11 matching lines...) Expand all Loading... |
1370 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ | 1384 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ |
1371 .endm | 1385 .endm |
1372 | 1386 |
1373 .macro do_yuv_to_rgb_stage2 | 1387 .macro do_yuv_to_rgb_stage2 |
1374 vrshrn.s32 d20, q10, #15 | 1388 vrshrn.s32 d20, q10, #15 |
1375 vrshrn.s32 d21, q11, #15 | 1389 vrshrn.s32 d21, q11, #15 |
1376 vrshrn.s32 d24, q12, #14 | 1390 vrshrn.s32 d24, q12, #14 |
1377 vrshrn.s32 d25, q13, #14 | 1391 vrshrn.s32 d25, q13, #14 |
1378 vrshrn.s32 d28, q14, #14 | 1392 vrshrn.s32 d28, q14, #14 |
1379 vrshrn.s32 d29, q15, #14 | 1393 vrshrn.s32 d29, q15, #14 |
1380 vaddw.u8 q10, q10, d0 | 1394 vaddw.u8 q11, q10, d0 |
1381 vaddw.u8 q12, q12, d0 | 1395 vaddw.u8 q12, q12, d0 |
1382 vaddw.u8 q14, q14, d0 | 1396 vaddw.u8 q14, q14, d0 |
1383 vqmovun.s16 d1\g_offs, q10 | 1397 .if \bpp != 16 |
| 1398 vqmovun.s16 d1\g_offs, q11 |
1384 vqmovun.s16 d1\r_offs, q12 | 1399 vqmovun.s16 d1\r_offs, q12 |
1385 vqmovun.s16 d1\b_offs, q14 | 1400 vqmovun.s16 d1\b_offs, q14 |
| 1401 .else /* rgb565 */ |
| 1402 vqshlu.s16 q13, q11, #8 |
| 1403 vqshlu.s16 q15, q12, #8 |
| 1404 vqshlu.s16 q14, q14, #8 |
| 1405 vsri.u16 q15, q13, #5 |
| 1406 vsri.u16 q15, q14, #11 |
| 1407 .endif |
1386 .endm | 1408 .endm |
1387 | 1409 |
1388 .macro do_yuv_to_rgb_stage2_store_load_stage1 | 1410 .macro do_yuv_to_rgb_stage2_store_load_stage1 |
1389 vld1.8 {d4}, [U, :64]! | 1411 /* "do_yuv_to_rgb_stage2" and "store" */ |
1390 vrshrn.s32 d20, q10, #15 | 1412 vrshrn.s32 d20, q10, #15 |
| 1413 /* "load" and "do_yuv_to_rgb_stage1" */ |
| 1414 pld [U, #64] |
1391 vrshrn.s32 d21, q11, #15 | 1415 vrshrn.s32 d21, q11, #15 |
| 1416 pld [V, #64] |
1392 vrshrn.s32 d24, q12, #14 | 1417 vrshrn.s32 d24, q12, #14 |
1393 vrshrn.s32 d25, q13, #14 | 1418 vrshrn.s32 d25, q13, #14 |
| 1419 vld1.8 {d4}, [U, :64]! |
1394 vrshrn.s32 d28, q14, #14 | 1420 vrshrn.s32 d28, q14, #14 |
1395 vld1.8 {d5}, [V, :64]! | 1421 vld1.8 {d5}, [V, :64]! |
1396 vrshrn.s32 d29, q15, #14 | 1422 vrshrn.s32 d29, q15, #14 |
1397 vaddw.u8 q10, q10, d0 | 1423 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ |
| 1424 vaddw.u8 q4, q1, d5 /* q4 = v - 128 */ |
| 1425 vaddw.u8 q11, q10, d0 |
| 1426 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ |
| 1427 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ |
1398 vaddw.u8 q12, q12, d0 | 1428 vaddw.u8 q12, q12, d0 |
1399 vaddw.u8 q14, q14, d0 | 1429 vaddw.u8 q14, q14, d0 |
1400 vqmovun.s16 d1\g_offs, q10 | 1430 .if \bpp != 16 /**************** rgb24/rgb32 *********************************/ |
| 1431 vqmovun.s16 d1\g_offs, q11 |
| 1432 pld [Y, #64] |
| 1433 vqmovun.s16 d1\r_offs, q12 |
1401 vld1.8 {d0}, [Y, :64]! | 1434 vld1.8 {d0}, [Y, :64]! |
1402 vqmovun.s16 d1\r_offs, q12 | |
1403 pld [U, #64] | |
1404 pld [V, #64] | |
1405 pld [Y, #64] | |
1406 vqmovun.s16 d1\b_offs, q14 | 1435 vqmovun.s16 d1\b_offs, q14 |
1407 vaddw.u8 q3, q1, d4 /* q3 = u - 128 */ | |
1408 vaddw.u8 q4, q1, d5 /* q4 = v - 128 */ | |
1409 do_store \bpp, 8 | |
1410 vmull.s16 q10, d6, d1[1] /* multiply by -11277 */ | |
1411 vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */ | |
1412 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ | 1436 vmull.s16 q11, d7, d1[1] /* multiply by -11277 */ |
1413 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ | 1437 vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */ |
| 1438 do_store \bpp, 8 |
1414 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ | 1439 vmull.s16 q12, d8, d1[0] /* multiply by 22971 */ |
1415 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ | 1440 vmull.s16 q13, d9, d1[0] /* multiply by 22971 */ |
1416 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ | 1441 vmull.s16 q14, d6, d1[3] /* multiply by 29033 */ |
1417 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ | 1442 vmull.s16 q15, d7, d1[3] /* multiply by 29033 */ |
| 1443 .else /**************************** rgb565 ***********************************/ |
| 1444 vqshlu.s16 q13, q11, #8 |
| 1445 pld [Y, #64] |
| 1446 vqshlu.s16 q15, q12, #8 |
| 1447 vqshlu.s16 q14, q14, #8 |
| 1448 vld1.8 {d0}, [Y, :64]! |
| 1449 vmull.s16 q11, d7, d1[1] |
| 1450 vmlal.s16 q11, d9, d1[2] |
| 1451 vsri.u16 q15, q13, #5 |
| 1452 vmull.s16 q12, d8, d1[0] |
| 1453 vsri.u16 q15, q14, #11 |
| 1454 vmull.s16 q13, d9, d1[0] |
| 1455 vmull.s16 q14, d6, d1[3] |
| 1456 do_store \bpp, 8 |
| 1457 vmull.s16 q15, d7, d1[3] |
| 1458 .endif |
1418 .endm | 1459 .endm |
1419 | 1460 |
1420 .macro do_yuv_to_rgb | 1461 .macro do_yuv_to_rgb |
1421 do_yuv_to_rgb_stage1 | 1462 do_yuv_to_rgb_stage1 |
1422 do_yuv_to_rgb_stage2 | 1463 do_yuv_to_rgb_stage2 |
1423 .endm | 1464 .endm |
1424 | 1465 |
1425 /* Apple gas crashes on adrl, work around that by using adr. | 1466 /* Apple gas crashes on adrl, work around that by using adr. |
1426 * But this requires a copy of these constants for each function. | 1467 * But this requires a copy of these constants for each function. |
1427 */ | 1468 */ |
(...skipping 121 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1549 | 1590 |
1550 .endm | 1591 .endm |
1551 | 1592 |
1552 /*--------------------------------- id ----- bpp R G B */ | 1593 /*--------------------------------- id ----- bpp R G B */ |
1553 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 | 1594 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 |
1554 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0 | 1595 generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0 |
1555 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 | 1596 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2 |
1556 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 | 1597 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0 |
1557 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 | 1598 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1 |
1558 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 | 1599 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3 |
| 1600 generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0 |
1559 | 1601 |
1560 .purgem do_load | 1602 .purgem do_load |
1561 .purgem do_store | 1603 .purgem do_store |
1562 | 1604 |
1563 | 1605 |
1564 /*****************************************************************************/ | 1606 /*****************************************************************************/ |
1565 | 1607 |
1566 /* | 1608 /* |
1567 * jsimd_extrgb_ycc_convert_neon | 1609 * jsimd_extrgb_ycc_convert_neon |
1568 * jsimd_extbgr_ycc_convert_neon | 1610 * jsimd_extbgr_ycc_convert_neon |
(...skipping 818 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2387 | 2429 |
2388 .unreq OUTPTR | 2430 .unreq OUTPTR |
2389 .unreq INPTR | 2431 .unreq INPTR |
2390 .unreq WIDTH | 2432 .unreq WIDTH |
2391 .unreq TMP | 2433 .unreq TMP |
2392 | 2434 |
2393 | 2435 |
2394 .purgem upsample16 | 2436 .purgem upsample16 |
2395 .purgem upsample32 | 2437 .purgem upsample32 |
2396 .purgem upsample_row | 2438 .purgem upsample_row |
OLD | NEW |