OLD | NEW |
1 /* | 1 /* |
2 * ARMv7 NEON optimizations for libjpeg-turbo | 2 * ARMv7 NEON optimizations for libjpeg-turbo |
3 * | 3 * |
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). | 4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). |
5 * All rights reserved. | 5 * All rights reserved. |
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> | 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> |
7 * | 7 * |
8 * This software is provided 'as-is', without any express or implied | 8 * This software is provided 'as-is', without any express or implied |
9 * warranty. In no event will the authors be held liable for any damages | 9 * warranty. In no event will the authors be held liable for any damages |
10 * arising from the use of this software. | 10 * arising from the use of this software. |
(...skipping 23 matching lines...) Expand all Loading... |
34 | 34 |
35 | 35 |
36 #define RESPECT_STRICT_ALIGNMENT 1 | 36 #define RESPECT_STRICT_ALIGNMENT 1 |
37 | 37 |
38 | 38 |
39 /*****************************************************************************/ | 39 /*****************************************************************************/ |
40 | 40 |
41 /* Supplementary macro for setting function attributes */ | 41 /* Supplementary macro for setting function attributes */ |
/*
 * asm_function: opens the definition of the function named by `fname`.
 *
 * Apple targets (__APPLE__ defined): the symbol is exported with .globl
 * under the name `fname` prefixed by an underscore, and the matching
 * underscore-prefixed label is placed.
 *
 * All other targets: `fname` is exported with .global and the `fname`
 * label is placed; when __ELF__ is defined the symbol is additionally
 * marked .hidden and given the %function type.
 *
 * In this diff the OLD column's `.func` directive is dropped from both
 * branches in the NEW column; the `.endfunc` lines that closed each
 * function body are likewise removed further down the file.
 */
42 .macro asm_function fname | 42 .macro asm_function fname |
43 #ifdef __APPLE__ | 43 #ifdef __APPLE__ |
44 .func _\fname | |
45 .globl _\fname | 44 .globl _\fname |
46 _\fname: | 45 _\fname: |
47 #else | 46 #else |
48 .func \fname | |
49 .global \fname | 47 .global \fname |
50 #ifdef __ELF__ | 48 #ifdef __ELF__ |
51 .hidden \fname | 49 .hidden \fname |
52 .type \fname, %function | 50 .type \fname, %function |
53 #endif | 51 #endif |
54 \fname: | 52 \fname: |
55 #endif | 53 #endif |
56 .endm | 54 .endm |
57 | 55 |
58 /* Transpose a block of 4x4 coefficients in four 64-bit registers */ | 56 /* Transpose a block of 4x4 coefficients in four 64-bit registers */ |
(...skipping 604 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
663 .unreq ROW3L | 661 .unreq ROW3L |
664 .unreq ROW3R | 662 .unreq ROW3R |
665 .unreq ROW4L | 663 .unreq ROW4L |
666 .unreq ROW4R | 664 .unreq ROW4R |
667 .unreq ROW5L | 665 .unreq ROW5L |
668 .unreq ROW5R | 666 .unreq ROW5R |
669 .unreq ROW6L | 667 .unreq ROW6L |
670 .unreq ROW6R | 668 .unreq ROW6R |
671 .unreq ROW7L | 669 .unreq ROW7L |
672 .unreq ROW7R | 670 .unreq ROW7R |
673 .endfunc | |
674 | 671 |
675 | 672 |
676 /*****************************************************************************/ | 673 /*****************************************************************************/ |
677 | 674 |
678 /* | 675 /* |
679 * jsimd_idct_ifast_neon | 676 * jsimd_idct_ifast_neon |
680 * | 677 * |
681 * This function contains a fast, not so accurate integer implementation of | 678 * This function contains a fast, not so accurate integer implementation of |
682 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations | 679 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations |
683 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' | 680 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' |
(...skipping 204 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
888 bx lr | 885 bx lr |
889 | 886 |
890 .unreq DCT_TABLE | 887 .unreq DCT_TABLE |
891 .unreq COEF_BLOCK | 888 .unreq COEF_BLOCK |
892 .unreq OUTPUT_BUF | 889 .unreq OUTPUT_BUF |
893 .unreq OUTPUT_COL | 890 .unreq OUTPUT_COL |
894 .unreq TMP1 | 891 .unreq TMP1 |
895 .unreq TMP2 | 892 .unreq TMP2 |
896 .unreq TMP3 | 893 .unreq TMP3 |
897 .unreq TMP4 | 894 .unreq TMP4 |
898 .endfunc | |
899 | 895 |
900 | 896 |
901 /*****************************************************************************/ | 897 /*****************************************************************************/ |
902 | 898 |
903 /* | 899 /* |
904 * jsimd_idct_4x4_neon | 900 * jsimd_idct_4x4_neon |
905 * | 901 * |
906 * This function contains inverse-DCT code for getting reduced-size | 902 * This function contains inverse-DCT code for getting reduced-size |
907 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations | 903 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations |
908 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' | 904 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' |
(...skipping 192 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1101 bx lr | 1097 bx lr |
1102 | 1098 |
1103 .unreq DCT_TABLE | 1099 .unreq DCT_TABLE |
1104 .unreq COEF_BLOCK | 1100 .unreq COEF_BLOCK |
1105 .unreq OUTPUT_BUF | 1101 .unreq OUTPUT_BUF |
1106 .unreq OUTPUT_COL | 1102 .unreq OUTPUT_COL |
1107 .unreq TMP1 | 1103 .unreq TMP1 |
1108 .unreq TMP2 | 1104 .unreq TMP2 |
1109 .unreq TMP3 | 1105 .unreq TMP3 |
1110 .unreq TMP4 | 1106 .unreq TMP4 |
1111 .endfunc | |
1112 | 1107 |
1113 .purgem idct_helper | 1108 .purgem idct_helper |
1114 | 1109 |
1115 | 1110 |
1116 /*****************************************************************************/ | 1111 /*****************************************************************************/ |
1117 | 1112 |
1118 /* | 1113 /* |
1119 * jsimd_idct_2x2_neon | 1114 * jsimd_idct_2x2_neon |
1120 * | 1115 * |
1121 * This function contains inverse-DCT code for getting reduced-size | 1116 * This function contains inverse-DCT code for getting reduced-size |
(...skipping 134 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1256 | 1251 |
1257 vpop {d8-d15} | 1252 vpop {d8-d15} |
1258 bx lr | 1253 bx lr |
1259 | 1254 |
1260 .unreq DCT_TABLE | 1255 .unreq DCT_TABLE |
1261 .unreq COEF_BLOCK | 1256 .unreq COEF_BLOCK |
1262 .unreq OUTPUT_BUF | 1257 .unreq OUTPUT_BUF |
1263 .unreq OUTPUT_COL | 1258 .unreq OUTPUT_COL |
1264 .unreq TMP1 | 1259 .unreq TMP1 |
1265 .unreq TMP2 | 1260 .unreq TMP2 |
1266 .endfunc | |
1267 | 1261 |
1268 .purgem idct_helper | 1262 .purgem idct_helper |
1269 | 1263 |
1270 | 1264 |
1271 /*****************************************************************************/ | 1265 /*****************************************************************************/ |
1272 | 1266 |
1273 /* | 1267 /* |
1274 * jsimd_ycc_extrgb_convert_neon | 1268 * jsimd_ycc_extrgb_convert_neon |
1275 * jsimd_ycc_extbgr_convert_neon | 1269 * jsimd_ycc_extbgr_convert_neon |
1276 * jsimd_ycc_extrgbx_convert_neon | 1270 * jsimd_ycc_extrgbx_convert_neon |
(...skipping 263 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1540 .unreq OUTPUT_BUF | 1534 .unreq OUTPUT_BUF |
1541 .unreq NUM_ROWS | 1535 .unreq NUM_ROWS |
1542 .unreq INPUT_BUF0 | 1536 .unreq INPUT_BUF0 |
1543 .unreq INPUT_BUF1 | 1537 .unreq INPUT_BUF1 |
1544 .unreq INPUT_BUF2 | 1538 .unreq INPUT_BUF2 |
1545 .unreq RGB | 1539 .unreq RGB |
1546 .unreq Y | 1540 .unreq Y |
1547 .unreq U | 1541 .unreq U |
1548 .unreq V | 1542 .unreq V |
1549 .unreq N | 1543 .unreq N |
1550 .endfunc | |
1551 | 1544 |
1552 .purgem do_yuv_to_rgb | 1545 .purgem do_yuv_to_rgb |
1553 .purgem do_yuv_to_rgb_stage1 | 1546 .purgem do_yuv_to_rgb_stage1 |
1554 .purgem do_yuv_to_rgb_stage2 | 1547 .purgem do_yuv_to_rgb_stage2 |
1555 .purgem do_yuv_to_rgb_stage2_store_load_stage1 | 1548 .purgem do_yuv_to_rgb_stage2_store_load_stage1 |
1556 | 1549 |
1557 .endm | 1550 .endm |
1558 | 1551 |
1559 /*--------------------------------- id ----- bpp R G B */ | 1552 /*--------------------------------- id ----- bpp R G B */ |
1560 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 | 1553 generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2 |
(...skipping 290 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1851 .unreq INPUT_BUF | 1844 .unreq INPUT_BUF |
1852 .unreq NUM_ROWS | 1845 .unreq NUM_ROWS |
1853 .unreq OUTPUT_BUF0 | 1846 .unreq OUTPUT_BUF0 |
1854 .unreq OUTPUT_BUF1 | 1847 .unreq OUTPUT_BUF1 |
1855 .unreq OUTPUT_BUF2 | 1848 .unreq OUTPUT_BUF2 |
1856 .unreq RGB | 1849 .unreq RGB |
1857 .unreq Y | 1850 .unreq Y |
1858 .unreq U | 1851 .unreq U |
1859 .unreq V | 1852 .unreq V |
1860 .unreq N | 1853 .unreq N |
1861 .endfunc | |
1862 | 1854 |
1863 .purgem do_rgb_to_yuv | 1855 .purgem do_rgb_to_yuv |
1864 .purgem do_rgb_to_yuv_stage1 | 1856 .purgem do_rgb_to_yuv_stage1 |
1865 .purgem do_rgb_to_yuv_stage2 | 1857 .purgem do_rgb_to_yuv_stage2 |
1866 .purgem do_rgb_to_yuv_stage2_store_load_stage1 | 1858 .purgem do_rgb_to_yuv_stage2_store_load_stage1 |
1867 | 1859 |
1868 .endm | 1860 .endm |
1869 | 1861 |
1870 /*--------------------------------- id ----- bpp R G B */ | 1862 /*--------------------------------- id ----- bpp R G B */ |
1871 generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2 | 1863 generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2 |
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1933 pop {r4, r5} | 1925 pop {r4, r5} |
1934 bx lr | 1926 bx lr |
1935 | 1927 |
1936 .unreq SAMPLE_DATA | 1928 .unreq SAMPLE_DATA |
1937 .unreq START_COL | 1929 .unreq START_COL |
1938 .unreq WORKSPACE | 1930 .unreq WORKSPACE |
1939 .unreq TMP1 | 1931 .unreq TMP1 |
1940 .unreq TMP2 | 1932 .unreq TMP2 |
1941 .unreq TMP3 | 1933 .unreq TMP3 |
1942 .unreq TMP4 | 1934 .unreq TMP4 |
1943 .endfunc | |
1944 | 1935 |
1945 | 1936 |
1946 /*****************************************************************************/ | 1937 /*****************************************************************************/ |
1947 | 1938 |
1948 /* | 1939 /* |
1949 * jsimd_fdct_ifast_neon | 1940 * jsimd_fdct_ifast_neon |
1950 * | 1941 * |
1951 * This function contains a fast, not so accurate integer implementation of | 1942 * This function contains a fast, not so accurate integer implementation of |
1952 * the forward DCT (Discrete Cosine Transform). It uses the same calculations | 1943 * the forward DCT (Discrete Cosine Transform). It uses the same calculations |
1953 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' | 1944 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' |
(...skipping 103 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2057 vst1.16 {d16, d17, d18, d19}, [DATA, :128]! | 2048 vst1.16 {d16, d17, d18, d19}, [DATA, :128]! |
2058 vst1.16 {d20, d21, d22, d23}, [DATA, :128]! | 2049 vst1.16 {d20, d21, d22, d23}, [DATA, :128]! |
2059 vst1.16 {d24, d25, d26, d27}, [DATA, :128]! | 2050 vst1.16 {d24, d25, d26, d27}, [DATA, :128]! |
2060 vst1.16 {d28, d29, d30, d31}, [DATA, :128] | 2051 vst1.16 {d28, d29, d30, d31}, [DATA, :128] |
2061 | 2052 |
2062 vpop {d8-d15} | 2053 vpop {d8-d15} |
2063 bx lr | 2054 bx lr |
2064 | 2055 |
2065 .unreq DATA | 2056 .unreq DATA |
2066 .unreq TMP | 2057 .unreq TMP |
2067 .endfunc | |
2068 | 2058 |
2069 | 2059 |
2070 /*****************************************************************************/ | 2060 /*****************************************************************************/ |
2071 | 2061 |
2072 /* | 2062 /* |
2073 * GLOBAL(void) | 2063 * GLOBAL(void) |
2074 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors, | 2064 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors, |
2075 * DCTELEM * workspace); | 2065 * DCTELEM * workspace); |
2076 * | 2066 * |
2077 * Note: the code uses 2 stage pipelining in order to improve instructions | 2067 * Note: the code uses 2 stage pipelining in order to improve instructions |
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2159 | 2149 |
2160 bx lr /* return */ | 2150 bx lr /* return */ |
2161 | 2151 |
2162 .unreq COEF_BLOCK | 2152 .unreq COEF_BLOCK |
2163 .unreq DIVISORS | 2153 .unreq DIVISORS |
2164 .unreq WORKSPACE | 2154 .unreq WORKSPACE |
2165 .unreq RECIPROCAL | 2155 .unreq RECIPROCAL |
2166 .unreq CORRECTION | 2156 .unreq CORRECTION |
2167 .unreq SHIFT | 2157 .unreq SHIFT |
2168 .unreq LOOP_COUNT | 2158 .unreq LOOP_COUNT |
2169 .endfunc | |
2170 | 2159 |
2171 | 2160 |
2172 /*****************************************************************************/ | 2161 /*****************************************************************************/ |
2173 | 2162 |
2174 /* | 2163 /* |
2175 * GLOBAL(void) | 2164 * GLOBAL(void) |
2176 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor, | 2165 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor, |
2177 * JDIMENSION downsampled_width, | 2166 * JDIMENSION downsampled_width, |
2178 * JSAMPARRAY input_data, | 2167 * JSAMPARRAY input_data, |
2179 * JSAMPARRAY * output_data_ptr); | 2168 * JSAMPARRAY * output_data_ptr); |
(...skipping 214 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2394 .unreq DOWNSAMPLED_WIDTH | 2383 .unreq DOWNSAMPLED_WIDTH |
2395 .unreq INPUT_DATA | 2384 .unreq INPUT_DATA |
2396 .unreq OUTPUT_DATA_PTR | 2385 .unreq OUTPUT_DATA_PTR |
2397 .unreq OUTPUT_DATA | 2386 .unreq OUTPUT_DATA |
2398 | 2387 |
2399 .unreq OUTPTR | 2388 .unreq OUTPTR |
2400 .unreq INPTR | 2389 .unreq INPTR |
2401 .unreq WIDTH | 2390 .unreq WIDTH |
2402 .unreq TMP | 2391 .unreq TMP |
2403 | 2392 |
2404 .endfunc | |
2405 | 2393 |
2406 .purgem upsample16 | 2394 .purgem upsample16 |
2407 .purgem upsample32 | 2395 .purgem upsample32 |
2408 .purgem upsample_row | 2396 .purgem upsample_row |
OLD | NEW |