| OLD | NEW |
| 1 /* | 1 /* |
| 2 * ARMv7 NEON optimizations for libjpeg-turbo | 2 * ARMv7 NEON optimizations for libjpeg-turbo |
| 3 * | 3 * |
| 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). | 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). |
| 5 * All rights reserved. | 5 * All rights reserved. |
| 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> | 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> |
| 7 * | 7 * |
| 8 * This software is provided 'as-is', without any express or implied | 8 * This software is provided 'as-is', without any express or implied |
| 9 * warranty. In no event will the authors be held liable for any damages | 9 * warranty. In no event will the authors be held liable for any damages |
| 10 * arising from the use of this software. | 10 * arising from the use of this software. |
| (...skipping 23 matching lines...) Expand all Loading... |
| 34 | 34 |
| 35 | 35 |
| 36 #define RESPECT_STRICT_ALIGNMENT 1 | 36 #define RESPECT_STRICT_ALIGNMENT 1 |
| 37 | 37 |
| 38 | 38 |
| 39 /*****************************************************************************/ | 39 /*****************************************************************************/ |
| 40 | 40 |
| 41 /* Supplementary macro for setting function attributes: declares the symbol for fname global (underscore-prefixed on Apple targets; additionally hidden and typed %function on non-Apple ELF targets) and emits its label */ | 41 /* Supplementary macro for setting function attributes: declares the symbol for fname global (underscore-prefixed on Apple targets; additionally hidden and typed %function on non-Apple ELF targets) and emits its label */ |
| 42 .macro asm_function fname | 42 .macro asm_function fname |
| 43 #ifdef __APPLE__ | 43 #ifdef __APPLE__ |
| 44 .func _\fname | |
| 45 .globl _\fname | 44 .globl _\fname |
| 46 _\fname: | 45 _\fname: |
| 47 #else /* !__APPLE__ */ | 46 #else /* !__APPLE__ */ |
| 48 .func \fname | |
| 49 .global \fname | 47 .global \fname |
| 50 #ifdef __ELF__ | 48 #ifdef __ELF__ |
| 51 .hidden \fname | 49 .hidden \fname |
| 52 .type \fname, %function | 50 .type \fname, %function |
| 53 #endif /* __ELF__ */ | 51 #endif /* __ELF__ */ |
| 54 \fname: | 52 \fname: |
| 55 #endif /* __APPLE__ */ | 53 #endif /* __APPLE__ */ |
| 56 .endm | 54 .endm |
| 57 | 55 |
| 58 /* Transpose a block of 4x4 coefficients in four 64-bit registers */ | 56 /* Transpose a block of 4x4 coefficients in four 64-bit registers */ |
| (...skipping 604 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 663 .unreq ROW3L | 661 .unreq ROW3L |
| 664 .unreq ROW3R | 662 .unreq ROW3R |
| 665 .unreq ROW4L | 663 .unreq ROW4L |
| 666 .unreq ROW4R | 664 .unreq ROW4R |
| 667 .unreq ROW5L | 665 .unreq ROW5L |
| 668 .unreq ROW5R | 666 .unreq ROW5R |
| 669 .unreq ROW6L | 667 .unreq ROW6L |
| 670 .unreq ROW6R | 668 .unreq ROW6R |
| 671 .unreq ROW7L | 669 .unreq ROW7L |
| 672 .unreq ROW7R | 670 .unreq ROW7R |
| 673 .endfunc | |
| 674 | 671 |
| 675 | 672 |
| 676 /*****************************************************************************/ | 673 /*****************************************************************************/ |
| 677 | 674 |
| 678 /* | 675 /* |
| 679 * jsimd_idct_ifast_neon | 676 * jsimd_idct_ifast_neon |
| 680 * | 677 * |
| 681 * This function contains a fast, not so accurate integer implementation of | 678 * This function contains a fast, not so accurate integer implementation of |
| 682 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations | 679 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations |
| 683 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' | 680 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' |
| (...skipping 204 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 888 bx lr | 885 bx lr |
| 889 | 886 |
| 890 .unreq DCT_TABLE | 887 .unreq DCT_TABLE |
| 891 .unreq COEF_BLOCK | 888 .unreq COEF_BLOCK |
| 892 .unreq OUTPUT_BUF | 889 .unreq OUTPUT_BUF |
| 893 .unreq OUTPUT_COL | 890 .unreq OUTPUT_COL |
| 894 .unreq TMP1 | 891 .unreq TMP1 |
| 895 .unreq TMP2 | 892 .unreq TMP2 |
| 896 .unreq TMP3 | 893 .unreq TMP3 |
| 897 .unreq TMP4 | 894 .unreq TMP4 |
| 898 .endfunc | |
| 899 | 895 |
| 900 | 896 |
| 901 /*****************************************************************************/ | 897 /*****************************************************************************/ |
| 902 | 898 |
| 903 /* | 899 /* |
| 904 * jsimd_idct_4x4_neon | 900 * jsimd_idct_4x4_neon |
| 905 * | 901 * |
| 906 * This function contains inverse-DCT code for getting reduced-size | 902 * This function contains inverse-DCT code for getting reduced-size |
| 907 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations | 903 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations |
| 908 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' | 904 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' |
| (...skipping 192 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1101 bx lr | 1097 bx lr |
| 1102 | 1098 |
| 1103 .unreq DCT_TABLE | 1099 .unreq DCT_TABLE |
| 1104 .unreq COEF_BLOCK | 1100 .unreq COEF_BLOCK |
| 1105 .unreq OUTPUT_BUF | 1101 .unreq OUTPUT_BUF |
| 1106 .unreq OUTPUT_COL | 1102 .unreq OUTPUT_COL |
| 1107 .unreq TMP1 | 1103 .unreq TMP1 |
| 1108 .unreq TMP2 | 1104 .unreq TMP2 |
| 1109 .unreq TMP3 | 1105 .unreq TMP3 |
| 1110 .unreq TMP4 | 1106 .unreq TMP4 |
| 1111 .endfunc | |
| 1112 | 1107 |
| 1113 .purgem idct_helper | 1108 .purgem idct_helper |
| 1114 | 1109 |
| 1115 | 1110 |
| 1116 /*****************************************************************************/ | 1111 /*****************************************************************************/ |
| 1117 | 1112 |
| 1118 /* | 1113 /* |
| 1119 * jsimd_idct_2x2_neon | 1114 * jsimd_idct_2x2_neon |
| 1120 * | 1115 * |
| 1121 * This function contains inverse-DCT code for getting reduced-size | 1116 * This function contains inverse-DCT code for getting reduced-size |
| (...skipping 134 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1256 | 1251 |
| 1257 vpop {d8-d15} | 1252 vpop {d8-d15} |
| 1258 bx lr | 1253 bx lr |
| 1259 | 1254 |
| 1260 .unreq DCT_TABLE | 1255 .unreq DCT_TABLE |
| 1261 .unreq COEF_BLOCK | 1256 .unreq COEF_BLOCK |
| 1262 .unreq OUTPUT_BUF | 1257 .unreq OUTPUT_BUF |
| 1263 .unreq OUTPUT_COL | 1258 .unreq OUTPUT_COL |
| 1264 .unreq TMP1 | 1259 .unreq TMP1 |
| 1265 .unreq TMP2 | 1260 .unreq TMP2 |
| 1266 .endfunc | |
| 1267 | 1261 |
| 1268 .purgem idct_helper | 1262 .purgem idct_helper |
| 1269 | 1263 |
| 1270 | 1264 |
| 1271 /*****************************************************************************/ | 1265 /*****************************************************************************/ |
| 1272 | 1266 |
| 1273 /* | 1267 /* |
| 1274 * jsimd_ycc_extrgb_convert_neon | 1268 * jsimd_ycc_extrgb_convert_neon |
| 1275 * jsimd_ycc_extbgr_convert_neon | 1269 * jsimd_ycc_extbgr_convert_neon |
| 1276 * jsimd_ycc_extrgbx_convert_neon | 1270 * jsimd_ycc_extrgbx_convert_neon |
| (...skipping 263 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1540 .unreq OUTPUT_BUF | 1534 .unreq OUTPUT_BUF |
| 1541 .unreq NUM_ROWS | 1535 .unreq NUM_ROWS |
| 1542 .unreq INPUT_BUF0 | 1536 .unreq INPUT_BUF0 |
| 1543 .unreq INPUT_BUF1 | 1537 .unreq INPUT_BUF1 |
| 1544 .unreq INPUT_BUF2 | 1538 .unreq INPUT_BUF2 |
| 1545 .unreq RGB | 1539 .unreq RGB |
| 1546 .unreq Y | 1540 .unreq Y |
| 1547 .unreq U | 1541 .unreq U |
| 1548 .unreq V | 1542 .unreq V |
| 1549 .unreq N | 1543 .unreq N |
| 1550 .endfunc | |
| 1551 | 1544 |
| 1552 .purgem do_yuv_to_rgb | 1545 .purgem do_yuv_to_rgb |
| 1553 .purgem do_yuv_to_rgb_stage1 | 1546 .purgem do_yuv_to_rgb_stage1 |
| 1554 .purgem do_yuv_to_rgb_stage2 | 1547 .purgem do_yuv_to_rgb_stage2 |
| 1555 .purgem do_yuv_to_rgb_stage2_store_load_stage1 | 1548 .purgem do_yuv_to_rgb_stage2_store_load_stage1 |
| 1556 | 1549 |
| 1557 .endm | 1550 .endm |
| 1558 | 1551 |
| 1559 /*--------------------------------- id ----- bpp R G B */ | 1552 /*--------------------------------- id ----- bpp R G B */ |
| 1560 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 | 1553 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 |
| (...skipping 290 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1851 .unreq INPUT_BUF | 1844 .unreq INPUT_BUF |
| 1852 .unreq NUM_ROWS | 1845 .unreq NUM_ROWS |
| 1853 .unreq OUTPUT_BUF0 | 1846 .unreq OUTPUT_BUF0 |
| 1854 .unreq OUTPUT_BUF1 | 1847 .unreq OUTPUT_BUF1 |
| 1855 .unreq OUTPUT_BUF2 | 1848 .unreq OUTPUT_BUF2 |
| 1856 .unreq RGB | 1849 .unreq RGB |
| 1857 .unreq Y | 1850 .unreq Y |
| 1858 .unreq U | 1851 .unreq U |
| 1859 .unreq V | 1852 .unreq V |
| 1860 .unreq N | 1853 .unreq N |
| 1861 .endfunc | |
| 1862 | 1854 |
| 1863 .purgem do_rgb_to_yuv | 1855 .purgem do_rgb_to_yuv |
| 1864 .purgem do_rgb_to_yuv_stage1 | 1856 .purgem do_rgb_to_yuv_stage1 |
| 1865 .purgem do_rgb_to_yuv_stage2 | 1857 .purgem do_rgb_to_yuv_stage2 |
| 1866 .purgem do_rgb_to_yuv_stage2_store_load_stage1 | 1858 .purgem do_rgb_to_yuv_stage2_store_load_stage1 |
| 1867 | 1859 |
| 1868 .endm | 1860 .endm |
| 1869 | 1861 |
| 1870 /*--------------------------------- id ----- bpp R G B */ | 1862 /*--------------------------------- id ----- bpp R G B */ |
| 1871 generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2 | 1863 generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2 |
| (...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1933 pop {r4, r5} | 1925 pop {r4, r5} |
| 1934 bx lr | 1926 bx lr |
| 1935 | 1927 |
| 1936 .unreq SAMPLE_DATA | 1928 .unreq SAMPLE_DATA |
| 1937 .unreq START_COL | 1929 .unreq START_COL |
| 1938 .unreq WORKSPACE | 1930 .unreq WORKSPACE |
| 1939 .unreq TMP1 | 1931 .unreq TMP1 |
| 1940 .unreq TMP2 | 1932 .unreq TMP2 |
| 1941 .unreq TMP3 | 1933 .unreq TMP3 |
| 1942 .unreq TMP4 | 1934 .unreq TMP4 |
| 1943 .endfunc | |
| 1944 | 1935 |
| 1945 | 1936 |
| 1946 /*****************************************************************************/ | 1937 /*****************************************************************************/ |
| 1947 | 1938 |
| 1948 /* | 1939 /* |
| 1949 * jsimd_fdct_ifast_neon | 1940 * jsimd_fdct_ifast_neon |
| 1950 * | 1941 * |
| 1951 * This function contains a fast, not so accurate integer implementation of | 1942 * This function contains a fast, not so accurate integer implementation of |
| 1952 * the forward DCT (Discrete Cosine Transform). It uses the same calculations | 1943 * the forward DCT (Discrete Cosine Transform). It uses the same calculations |
| 1953 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' | 1944 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' |
| (...skipping 103 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2057 vst1.16 {d16, d17, d18, d19}, [DATA, :128]! | 2048 vst1.16 {d16, d17, d18, d19}, [DATA, :128]! |
| 2058 vst1.16 {d20, d21, d22, d23}, [DATA, :128]! | 2049 vst1.16 {d20, d21, d22, d23}, [DATA, :128]! |
| 2059 vst1.16 {d24, d25, d26, d27}, [DATA, :128]! | 2050 vst1.16 {d24, d25, d26, d27}, [DATA, :128]! |
| 2060 vst1.16 {d28, d29, d30, d31}, [DATA, :128] | 2051 vst1.16 {d28, d29, d30, d31}, [DATA, :128] |
| 2061 | 2052 |
| 2062 vpop {d8-d15} | 2053 vpop {d8-d15} |
| 2063 bx lr | 2054 bx lr |
| 2064 | 2055 |
| 2065 .unreq DATA | 2056 .unreq DATA |
| 2066 .unreq TMP | 2057 .unreq TMP |
| 2067 .endfunc | |
| 2068 | 2058 |
| 2069 | 2059 |
| 2070 /*****************************************************************************/ | 2060 /*****************************************************************************/ |
| 2071 | 2061 |
| 2072 /* | 2062 /* |
| 2073 * GLOBAL(void) | 2063 * GLOBAL(void) |
| 2074 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors, | 2064 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors, |
| 2075 * DCTELEM * workspace); | 2065 * DCTELEM * workspace); |
| 2076 * | 2066 * |
| 2077 * Note: the code uses 2 stage pipelining in order to improve instructions | 2067 * Note: the code uses 2 stage pipelining in order to improve instructions |
| (...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2159 | 2149 |
| 2160 bx lr /* return */ | 2150 bx lr /* return */ |
| 2161 | 2151 |
| 2162 .unreq COEF_BLOCK | 2152 .unreq COEF_BLOCK |
| 2163 .unreq DIVISORS | 2153 .unreq DIVISORS |
| 2164 .unreq WORKSPACE | 2154 .unreq WORKSPACE |
| 2165 .unreq RECIPROCAL | 2155 .unreq RECIPROCAL |
| 2166 .unreq CORRECTION | 2156 .unreq CORRECTION |
| 2167 .unreq SHIFT | 2157 .unreq SHIFT |
| 2168 .unreq LOOP_COUNT | 2158 .unreq LOOP_COUNT |
| 2169 .endfunc | |
| 2170 | 2159 |
| 2171 | 2160 |
| 2172 /*****************************************************************************/ | 2161 /*****************************************************************************/ |
| 2173 | 2162 |
| 2174 /* | 2163 /* |
| 2175 * GLOBAL(void) | 2164 * GLOBAL(void) |
| 2176 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor, | 2165 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor, |
| 2177 * JDIMENSION downsampled_width, | 2166 * JDIMENSION downsampled_width, |
| 2178 * JSAMPARRAY input_data, | 2167 * JSAMPARRAY input_data, |
| 2179 * JSAMPARRAY * output_data_ptr); | 2168 * JSAMPARRAY * output_data_ptr); |
| (...skipping 214 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2394 .unreq DOWNSAMPLED_WIDTH | 2383 .unreq DOWNSAMPLED_WIDTH |
| 2395 .unreq INPUT_DATA | 2384 .unreq INPUT_DATA |
| 2396 .unreq OUTPUT_DATA_PTR | 2385 .unreq OUTPUT_DATA_PTR |
| 2397 .unreq OUTPUT_DATA | 2386 .unreq OUTPUT_DATA |
| 2398 | 2387 |
| 2399 .unreq OUTPTR | 2388 .unreq OUTPTR |
| 2400 .unreq INPTR | 2389 .unreq INPTR |
| 2401 .unreq WIDTH | 2390 .unreq WIDTH |
| 2402 .unreq TMP | 2391 .unreq TMP |
| 2403 | 2392 |
| 2404 .endfunc | |
| 2405 | 2393 |
| 2406 .purgem upsample16 | 2394 .purgem upsample16 |
| 2407 .purgem upsample32 | 2395 .purgem upsample32 |
| 2408 .purgem upsample_row | 2396 .purgem upsample_row |
| OLD | NEW |