| OLD | NEW | 
|---|---|
| 1 /* | 1 /* | 
| 2  * ARMv7 NEON optimizations for libjpeg-turbo | 2  * ARMv7 NEON optimizations for libjpeg-turbo | 
| 3  * | 3  * | 
| 4  * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). | 4  * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). | 
| 5  * All rights reserved. | 5  * All rights reserved. | 
| 6  * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> | 6  * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> | 
| 7  * | 7  * | 
| 8  * This software is provided 'as-is', without any express or implied | 8  * This software is provided 'as-is', without any express or implied | 
| 9  * warranty.  In no event will the authors be held liable for any damages | 9  * warranty.  In no event will the authors be held liable for any damages | 
| 10  * arising from the use of this software. | 10  * arising from the use of this software. | 
| (...skipping 23 matching lines...) Expand all  Loading... | 
| 34 | 34 | 
| 35 | 35 | 
| 36 #define RESPECT_STRICT_ALIGNMENT 1 | 36 #define RESPECT_STRICT_ALIGNMENT 1 | 
| 37 | 37 | 
| 38 | 38 | 
| 39 /*****************************************************************************/ | 39 /*****************************************************************************/ | 
| 40 | 40 | 
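| 41 /* Supplementary macro: opens a function by exporting \fname as a global label ('_'-prefixed on Apple; hidden and typed %function on ELF) */ | 41 /* Supplementary macro: opens a function by exporting \fname as a global label ('_'-prefixed on Apple; hidden and typed %function on ELF) */ | 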
| 42 .macro asm_function fname | 42 .macro asm_function fname | 
| 43 #ifdef __APPLE__ | 43 #ifdef __APPLE__ | 
| 44     .func _\fname |  | 
| 45     .globl _\fname | 44     .globl _\fname | 
| 46 _\fname: | 45 _\fname: | 
| 47 #else | 46 #else | 
| 48     .func \fname |  | 
| 49     .global \fname | 47     .global \fname | 
| 50 #ifdef __ELF__ | 48 #ifdef __ELF__ | 
| 51     .hidden \fname | 49     .hidden \fname | 
| 52     .type \fname, %function | 50     .type \fname, %function | 
| 53 #endif | 51 #endif | 
| 54 \fname: | 52 \fname: | 
| 55 #endif | 53 #endif | 
| 56 .endm | 54 .endm | 
| 57 | 55 | 
| 58 /* Transpose a block of 4x4 coefficients in four 64-bit registers */ | 56 /* Transpose a block of 4x4 coefficients in four 64-bit registers */ | 
| (...skipping 604 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 663     .unreq          ROW3L | 661     .unreq          ROW3L | 
| 664     .unreq          ROW3R | 662     .unreq          ROW3R | 
| 665     .unreq          ROW4L | 663     .unreq          ROW4L | 
| 666     .unreq          ROW4R | 664     .unreq          ROW4R | 
| 667     .unreq          ROW5L | 665     .unreq          ROW5L | 
| 668     .unreq          ROW5R | 666     .unreq          ROW5R | 
| 669     .unreq          ROW6L | 667     .unreq          ROW6L | 
| 670     .unreq          ROW6R | 668     .unreq          ROW6R | 
| 671     .unreq          ROW7L | 669     .unreq          ROW7L | 
| 672     .unreq          ROW7R | 670     .unreq          ROW7R | 
| 673 .endfunc |  | 
| 674 | 671 | 
| 675 | 672 | 
| 676 /*****************************************************************************/ | 673 /*****************************************************************************/ | 
| 677 | 674 | 
| 678 /* | 675 /* | 
| 679  * jsimd_idct_ifast_neon | 676  * jsimd_idct_ifast_neon | 
| 680  * | 677  * | 
| 681  * This function contains a fast, not so accurate integer implementation of | 678  * This function contains a fast, not so accurate integer implementation of | 
| 682  * the inverse DCT (Discrete Cosine Transform). It uses the same calculations | 679  * the inverse DCT (Discrete Cosine Transform). It uses the same calculations | 
| 683  * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' | 680  * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' | 
| (...skipping 204 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 888     bx              lr | 885     bx              lr | 
| 889 | 886 | 
| 890     .unreq          DCT_TABLE | 887     .unreq          DCT_TABLE | 
| 891     .unreq          COEF_BLOCK | 888     .unreq          COEF_BLOCK | 
| 892     .unreq          OUTPUT_BUF | 889     .unreq          OUTPUT_BUF | 
| 893     .unreq          OUTPUT_COL | 890     .unreq          OUTPUT_COL | 
| 894     .unreq          TMP1 | 891     .unreq          TMP1 | 
| 895     .unreq          TMP2 | 892     .unreq          TMP2 | 
| 896     .unreq          TMP3 | 893     .unreq          TMP3 | 
| 897     .unreq          TMP4 | 894     .unreq          TMP4 | 
| 898 .endfunc |  | 
| 899 | 895 | 
| 900 | 896 | 
| 901 /*****************************************************************************/ | 897 /*****************************************************************************/ | 
| 902 | 898 | 
| 903 /* | 899 /* | 
| 904  * jsimd_idct_4x4_neon | 900  * jsimd_idct_4x4_neon | 
| 905  * | 901  * | 
| 906  * This function contains inverse-DCT code for getting reduced-size | 902  * This function contains inverse-DCT code for getting reduced-size | 
| 907  * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations | 903  * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations | 
| 908  * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' | 904  * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' | 
| (...skipping 192 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 1101     bx              lr | 1097     bx              lr | 
| 1102 | 1098 | 
| 1103     .unreq          DCT_TABLE | 1099     .unreq          DCT_TABLE | 
| 1104     .unreq          COEF_BLOCK | 1100     .unreq          COEF_BLOCK | 
| 1105     .unreq          OUTPUT_BUF | 1101     .unreq          OUTPUT_BUF | 
| 1106     .unreq          OUTPUT_COL | 1102     .unreq          OUTPUT_COL | 
| 1107     .unreq          TMP1 | 1103     .unreq          TMP1 | 
| 1108     .unreq          TMP2 | 1104     .unreq          TMP2 | 
| 1109     .unreq          TMP3 | 1105     .unreq          TMP3 | 
| 1110     .unreq          TMP4 | 1106     .unreq          TMP4 | 
| 1111 .endfunc |  | 
| 1112 | 1107 | 
| 1113 .purgem idct_helper | 1108 .purgem idct_helper | 
| 1114 | 1109 | 
| 1115 | 1110 | 
| 1116 /*****************************************************************************/ | 1111 /*****************************************************************************/ | 
| 1117 | 1112 | 
| 1118 /* | 1113 /* | 
| 1119  * jsimd_idct_2x2_neon | 1114  * jsimd_idct_2x2_neon | 
| 1120  * | 1115  * | 
| 1121  * This function contains inverse-DCT code for getting reduced-size | 1116  * This function contains inverse-DCT code for getting reduced-size | 
| (...skipping 134 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 1256 | 1251 | 
| 1257     vpop            {d8-d15} | 1252     vpop            {d8-d15} | 
| 1258     bx              lr | 1253     bx              lr | 
| 1259 | 1254 | 
| 1260     .unreq          DCT_TABLE | 1255     .unreq          DCT_TABLE | 
| 1261     .unreq          COEF_BLOCK | 1256     .unreq          COEF_BLOCK | 
| 1262     .unreq          OUTPUT_BUF | 1257     .unreq          OUTPUT_BUF | 
| 1263     .unreq          OUTPUT_COL | 1258     .unreq          OUTPUT_COL | 
| 1264     .unreq          TMP1 | 1259     .unreq          TMP1 | 
| 1265     .unreq          TMP2 | 1260     .unreq          TMP2 | 
| 1266 .endfunc |  | 
| 1267 | 1261 | 
| 1268 .purgem idct_helper | 1262 .purgem idct_helper | 
| 1269 | 1263 | 
| 1270 | 1264 | 
| 1271 /*****************************************************************************/ | 1265 /*****************************************************************************/ | 
| 1272 | 1266 | 
| 1273 /* | 1267 /* | 
| 1274  * jsimd_ycc_extrgb_convert_neon | 1268  * jsimd_ycc_extrgb_convert_neon | 
| 1275  * jsimd_ycc_extbgr_convert_neon | 1269  * jsimd_ycc_extbgr_convert_neon | 
| 1276  * jsimd_ycc_extrgbx_convert_neon | 1270  * jsimd_ycc_extrgbx_convert_neon | 
| (...skipping 263 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 1540     .unreq          OUTPUT_BUF | 1534     .unreq          OUTPUT_BUF | 
| 1541     .unreq          NUM_ROWS | 1535     .unreq          NUM_ROWS | 
| 1542     .unreq          INPUT_BUF0 | 1536     .unreq          INPUT_BUF0 | 
| 1543     .unreq          INPUT_BUF1 | 1537     .unreq          INPUT_BUF1 | 
| 1544     .unreq          INPUT_BUF2 | 1538     .unreq          INPUT_BUF2 | 
| 1545     .unreq          RGB | 1539     .unreq          RGB | 
| 1546     .unreq          Y | 1540     .unreq          Y | 
| 1547     .unreq          U | 1541     .unreq          U | 
| 1548     .unreq          V | 1542     .unreq          V | 
| 1549     .unreq          N | 1543     .unreq          N | 
| 1550 .endfunc |  | 
| 1551 | 1544 | 
| 1552 .purgem do_yuv_to_rgb | 1545 .purgem do_yuv_to_rgb | 
| 1553 .purgem do_yuv_to_rgb_stage1 | 1546 .purgem do_yuv_to_rgb_stage1 | 
| 1554 .purgem do_yuv_to_rgb_stage2 | 1547 .purgem do_yuv_to_rgb_stage2 | 
| 1555 .purgem do_yuv_to_rgb_stage2_store_load_stage1 | 1548 .purgem do_yuv_to_rgb_stage2_store_load_stage1 | 
| 1556 | 1549 | 
| 1557 .endm | 1550 .endm | 
| 1558 | 1551 | 
| 1559 /*--------------------------------- id ----- bpp R  G  B */ | 1552 /*--------------------------------- id ----- bpp R  G  B */ | 
| 1560 generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2 | 1553 generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2 | 
| (...skipping 290 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 1851     .unreq          INPUT_BUF | 1844     .unreq          INPUT_BUF | 
| 1852     .unreq          NUM_ROWS | 1845     .unreq          NUM_ROWS | 
| 1853     .unreq          OUTPUT_BUF0 | 1846     .unreq          OUTPUT_BUF0 | 
| 1854     .unreq          OUTPUT_BUF1 | 1847     .unreq          OUTPUT_BUF1 | 
| 1855     .unreq          OUTPUT_BUF2 | 1848     .unreq          OUTPUT_BUF2 | 
| 1856     .unreq          RGB | 1849     .unreq          RGB | 
| 1857     .unreq          Y | 1850     .unreq          Y | 
| 1858     .unreq          U | 1851     .unreq          U | 
| 1859     .unreq          V | 1852     .unreq          V | 
| 1860     .unreq          N | 1853     .unreq          N | 
| 1861 .endfunc |  | 
| 1862 | 1854 | 
| 1863 .purgem do_rgb_to_yuv | 1855 .purgem do_rgb_to_yuv | 
| 1864 .purgem do_rgb_to_yuv_stage1 | 1856 .purgem do_rgb_to_yuv_stage1 | 
| 1865 .purgem do_rgb_to_yuv_stage2 | 1857 .purgem do_rgb_to_yuv_stage2 | 
| 1866 .purgem do_rgb_to_yuv_stage2_store_load_stage1 | 1858 .purgem do_rgb_to_yuv_stage2_store_load_stage1 | 
| 1867 | 1859 | 
| 1868 .endm | 1860 .endm | 
| 1869 | 1861 | 
| 1870 /*--------------------------------- id ----- bpp R  G  B */ | 1862 /*--------------------------------- id ----- bpp R  G  B */ | 
| 1871 generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2 | 1863 generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2 | 
| (...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 1933     pop             {r4, r5} | 1925     pop             {r4, r5} | 
| 1934     bx              lr | 1926     bx              lr | 
| 1935 | 1927 | 
| 1936     .unreq          SAMPLE_DATA | 1928     .unreq          SAMPLE_DATA | 
| 1937     .unreq          START_COL | 1929     .unreq          START_COL | 
| 1938     .unreq          WORKSPACE | 1930     .unreq          WORKSPACE | 
| 1939     .unreq          TMP1 | 1931     .unreq          TMP1 | 
| 1940     .unreq          TMP2 | 1932     .unreq          TMP2 | 
| 1941     .unreq          TMP3 | 1933     .unreq          TMP3 | 
| 1942     .unreq          TMP4 | 1934     .unreq          TMP4 | 
| 1943 .endfunc |  | 
| 1944 | 1935 | 
| 1945 | 1936 | 
| 1946 /*****************************************************************************/ | 1937 /*****************************************************************************/ | 
| 1947 | 1938 | 
| 1948 /* | 1939 /* | 
| 1949  * jsimd_fdct_ifast_neon | 1940  * jsimd_fdct_ifast_neon | 
| 1950  * | 1941  * | 
| 1951  * This function contains a fast, not so accurate integer implementation of | 1942  * This function contains a fast, not so accurate integer implementation of | 
| 1952  * the forward DCT (Discrete Cosine Transform). It uses the same calculations | 1943  * the forward DCT (Discrete Cosine Transform). It uses the same calculations | 
| 1953  * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' | 1944  * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' | 
| (...skipping 103 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 2057     vst1.16         {d16, d17, d18, d19}, [DATA, :128]! | 2048     vst1.16         {d16, d17, d18, d19}, [DATA, :128]! | 
| 2058     vst1.16         {d20, d21, d22, d23}, [DATA, :128]! | 2049     vst1.16         {d20, d21, d22, d23}, [DATA, :128]! | 
| 2059     vst1.16         {d24, d25, d26, d27}, [DATA, :128]! | 2050     vst1.16         {d24, d25, d26, d27}, [DATA, :128]! | 
| 2060     vst1.16         {d28, d29, d30, d31}, [DATA, :128] | 2051     vst1.16         {d28, d29, d30, d31}, [DATA, :128] | 
| 2061 | 2052 | 
| 2062     vpop            {d8-d15} | 2053     vpop            {d8-d15} | 
| 2063     bx              lr | 2054     bx              lr | 
| 2064 | 2055 | 
| 2065     .unreq          DATA | 2056     .unreq          DATA | 
| 2066     .unreq          TMP | 2057     .unreq          TMP | 
| 2067 .endfunc |  | 
| 2068 | 2058 | 
| 2069 | 2059 | 
| 2070 /*****************************************************************************/ | 2060 /*****************************************************************************/ | 
| 2071 | 2061 | 
| 2072 /* | 2062 /* | 
| 2073  * GLOBAL(void) | 2063  * GLOBAL(void) | 
| 2074  * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors, | 2064  * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors, | 
| 2075  *                      DCTELEM * workspace); | 2065  *                      DCTELEM * workspace); | 
| 2076  * | 2066  * | 
| 2077  * Note: the code uses 2 stage pipelining in order to improve instructions | 2067  * Note: the code uses 2 stage pipelining in order to improve instructions | 
| (...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 2159 | 2149 | 
| 2160     bx              lr /* return */ | 2150     bx              lr /* return */ | 
| 2161 | 2151 | 
| 2162     .unreq          COEF_BLOCK | 2152     .unreq          COEF_BLOCK | 
| 2163     .unreq          DIVISORS | 2153     .unreq          DIVISORS | 
| 2164     .unreq          WORKSPACE | 2154     .unreq          WORKSPACE | 
| 2165     .unreq          RECIPROCAL | 2155     .unreq          RECIPROCAL | 
| 2166     .unreq          CORRECTION | 2156     .unreq          CORRECTION | 
| 2167     .unreq          SHIFT | 2157     .unreq          SHIFT | 
| 2168     .unreq          LOOP_COUNT | 2158     .unreq          LOOP_COUNT | 
| 2169 .endfunc |  | 
| 2170 | 2159 | 
| 2171 | 2160 | 
| 2172 /*****************************************************************************/ | 2161 /*****************************************************************************/ | 
| 2173 | 2162 | 
| 2174 /* | 2163 /* | 
| 2175  * GLOBAL(void) | 2164  * GLOBAL(void) | 
| 2176  * jsimd_h2v1_fancy_upsample_neon (int          max_v_samp_factor, | 2165  * jsimd_h2v1_fancy_upsample_neon (int          max_v_samp_factor, | 
| 2177  *                                 JDIMENSION   downsampled_width, | 2166  *                                 JDIMENSION   downsampled_width, | 
| 2178  *                                 JSAMPARRAY   input_data, | 2167  *                                 JSAMPARRAY   input_data, | 
| 2179  *                                 JSAMPARRAY * output_data_ptr); | 2168  *                                 JSAMPARRAY * output_data_ptr); | 
| (...skipping 214 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 2394     .unreq          DOWNSAMPLED_WIDTH | 2383     .unreq          DOWNSAMPLED_WIDTH | 
| 2395     .unreq          INPUT_DATA | 2384     .unreq          INPUT_DATA | 
| 2396     .unreq          OUTPUT_DATA_PTR | 2385     .unreq          OUTPUT_DATA_PTR | 
| 2397     .unreq          OUTPUT_DATA | 2386     .unreq          OUTPUT_DATA | 
| 2398 | 2387 | 
| 2399     .unreq          OUTPTR | 2388     .unreq          OUTPTR | 
| 2400     .unreq          INPTR | 2389     .unreq          INPTR | 
| 2401     .unreq          WIDTH | 2390     .unreq          WIDTH | 
| 2402     .unreq          TMP | 2391     .unreq          TMP | 
| 2403 | 2392 | 
| 2404 .endfunc |  | 
| 2405 | 2393 | 
| 2406 .purgem upsample16 | 2394 .purgem upsample16 | 
| 2407 .purgem upsample32 | 2395 .purgem upsample32 | 
| 2408 .purgem upsample_row | 2396 .purgem upsample_row | 
| OLD | NEW | 
|---|