Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(91)

Side by Side Diff: google.patch

Issue 434123003: Add ARM64 SIMD support to libjpeg_turbo (Closed) Base URL: http://src.chromium.org/svn/trunk/deps/third_party/libjpeg_turbo
Patch Set: Add neon fixes Created 6 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « README.chromium ('k') | libjpeg.gyp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 Index: jdmarker.c 1 Index: jdmarker.c
2 =================================================================== 2 ===================================================================
3 --- jdmarker.c (revision 829) 3 --- jdmarker.c (revision 829)
4 +++ jdmarker.c (working copy) 4 +++ jdmarker.c (working copy)
5 @@ -910,7 +910,7 @@ 5 @@ -910,7 +910,7 @@
6 } 6 }
7 7
8 if (cinfo->marker->discarded_bytes != 0) { 8 if (cinfo->marker->discarded_bytes != 0) {
9 - WARNMS2(cinfo, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c); 9 - WARNMS2(cinfo, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c);
10 + TRACEMS2(cinfo, 1, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c); 10 + TRACEMS2(cinfo, 1, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c);
(...skipping 1920 matching lines...) Expand 10 before | Expand all | Expand 10 after
1931 code = dctbl->ehufco[nbits]; 1931 code = dctbl->ehufco[nbits];
1932 @@ -516,7 +546,7 @@ 1932 @@ -516,7 +546,7 @@
1933 temp ^= temp3; \ 1933 temp ^= temp3; \
1934 temp -= temp3; \ 1934 temp -= temp3; \
1935 temp2 += temp3; \ 1935 temp2 += temp3; \
1936 - nbits = jpeg_nbits_table[temp]; \ 1936 - nbits = jpeg_nbits_table[temp]; \
1937 + nbits = JPEG_NBITS_NONZERO(temp); \ 1937 + nbits = JPEG_NBITS_NONZERO(temp); \
1938 /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \ 1938 /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
1939 while (r > 15) { \ 1939 while (r > 15) { \
1940 EMIT_BITS(code_0xf0, size_0xf0) \ 1940 EMIT_BITS(code_0xf0, size_0xf0) \
1941 Index: simd/jsimd_arm64.c
1942 ===================================================================
1943 --- /dev/null
1944 +++ simd/jsimd_arm64.c
1945 @@ -0,0 +1,544 @@
1946 +/*
1947 + * jsimd_arm64.c
1948 + *
1949 + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
1950 + * Copyright 2009-2011, 2013-2014 D. R. Commander
1951 + *
1952 + * Based on the x86 SIMD extension for IJG JPEG library,
1953 + * Copyright (C) 1999-2006, MIYASAKA Masaru.
1954 + * For conditions of distribution and use, see copyright notice in jsimdext.inc
1955 + *
1956 + * This file contains the interface between the "normal" portions
1957 + * of the library and the SIMD implementations when running on a
1958 + * 64-bit ARM architecture.
1959 + */
1960 +
1961 +#define JPEG_INTERNALS
1962 +#include "../jinclude.h"
1963 +#include "../jpeglib.h"
1964 +#include "../jsimd.h"
1965 +#include "../jdct.h"
1966 +#include "../jsimddct.h"
1967 +#include "jsimd.h"
1968 +
1969 +#include <stdio.h>
1970 +#include <string.h>
1971 +#include <ctype.h>
1972 +
1973 +static unsigned int simd_support = ~0;
1974 +
1975 +/*
1976 + * Check what SIMD accelerations are supported.
1977 + *
1978 + * FIXME: This code is racy under a multi-threaded environment.
1979 + */
1980 +
1981 +/*
1982 + * ARMv8 architectures support NEON extensions by default.
1983 + * It is no longer optional as it was with ARMv7.
1984 + */
1985 +
1986 +
1987 +LOCAL(void)
1988 +init_simd (void)
1989 +{
1990 + char *env = NULL;
1991 +
1992 + if (simd_support != ~0U)
1993 + return;
1994 +
1995 + simd_support = 0;
1996 +
1997 + simd_support |= JSIMD_ARM_NEON;
1998 +
1999 + /* Force different settings through environment variables */
2000 + env = getenv("JSIMD_FORCENEON");
2001 + if ((env != NULL) && (strcmp(env, "1") == 0))
2002 + simd_support &= JSIMD_ARM_NEON;
2003 + env = getenv("JSIMD_FORCENONE");
2004 + if ((env != NULL) && (strcmp(env, "1") == 0))
2005 + simd_support = 0;
2006 +}
2007 +
2008 +GLOBAL(int)
2009 +jsimd_can_rgb_ycc (void)
2010 +{
2011 + init_simd();
2012 +
2013 + return 0;
2014 +}
2015 +
2016 +GLOBAL(int)
2017 +jsimd_can_rgb_gray (void)
2018 +{
2019 + init_simd();
2020 +
2021 + return 0;
2022 +}
2023 +
2024 +GLOBAL(int)
2025 +jsimd_can_ycc_rgb (void)
2026 +{
2027 + init_simd();
2028 +
2029 + /* The code is optimised for these values only */
2030 + if (BITS_IN_JSAMPLE != 8)
2031 + return 0;
2032 + if (sizeof(JDIMENSION) != 4)
2033 + return 0;
2034 + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
2035 + return 0;
2036 +
2037 + if (simd_support & JSIMD_ARM_NEON)
2038 + return 1;
2039 +
2040 + return 0;
2041 +}
2042 +
2043 +GLOBAL(int)
2044 +jsimd_can_ycc_rgb565 (void)
2045 +{
2046 + init_simd();
2047 +
2048 + /* The code is optimised for these values only */
2049 + if (BITS_IN_JSAMPLE != 8)
2050 + return 0;
2051 + if (sizeof(JDIMENSION) != 4)
2052 + return 0;
2053 +
2054 + if (simd_support & JSIMD_ARM_NEON)
2055 + return 1;
2056 +
2057 + return 0;
2058 +}
2059 +
2060 +GLOBAL(void)
2061 +jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
2062 + JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
2063 + JDIMENSION output_row, int num_rows)
2064 +{
2065 +}
2066 +
2067 +GLOBAL(void)
2068 +jsimd_rgb_gray_convert (j_compress_ptr cinfo,
2069 + JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
2070 + JDIMENSION output_row, int num_rows)
2071 +{
2072 +}
2073 +
2074 +GLOBAL(void)
2075 +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
2076 + JSAMPIMAGE input_buf, JDIMENSION input_row,
2077 + JSAMPARRAY output_buf, int num_rows)
2078 +{
2079 + void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
2080 +
2081 + switch(cinfo->out_color_space) {
2082 + case JCS_EXT_RGB:
2083 + neonfct=jsimd_ycc_extrgb_convert_neon;
2084 + break;
2085 + case JCS_EXT_RGBX:
2086 + case JCS_EXT_RGBA:
2087 + neonfct=jsimd_ycc_extrgbx_convert_neon;
2088 + break;
2089 + case JCS_EXT_BGR:
2090 + neonfct=jsimd_ycc_extbgr_convert_neon;
2091 + break;
2092 + case JCS_EXT_BGRX:
2093 + case JCS_EXT_BGRA:
2094 + neonfct=jsimd_ycc_extbgrx_convert_neon;
2095 + break;
2096 + case JCS_EXT_XBGR:
2097 + case JCS_EXT_ABGR:
2098 + neonfct=jsimd_ycc_extxbgr_convert_neon;
2099 + break;
2100 + case JCS_EXT_XRGB:
2101 + case JCS_EXT_ARGB:
2102 + neonfct=jsimd_ycc_extxrgb_convert_neon;
2103 + break;
2104 + default:
2105 + neonfct=jsimd_ycc_extrgb_convert_neon;
2106 + break;
2107 + }
2108 +
2109 + if (simd_support & JSIMD_ARM_NEON)
2110 + neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
2111 +}
2112 +
2113 +GLOBAL(void)
2114 +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
2115 + JSAMPIMAGE input_buf, JDIMENSION input_row,
2116 + JSAMPARRAY output_buf, int num_rows)
2117 +{
2118 + if (simd_support & JSIMD_ARM_NEON)
2119 + jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
2120 + output_buf, num_rows);
2121 +}
2122 +
2123 +GLOBAL(int)
2124 +jsimd_can_h2v2_downsample (void)
2125 +{
2126 + init_simd();
2127 +
2128 + return 0;
2129 +}
2130 +
2131 +GLOBAL(int)
2132 +jsimd_can_h2v1_downsample (void)
2133 +{
2134 + init_simd();
2135 +
2136 + return 0;
2137 +}
2138 +
2139 +GLOBAL(void)
2140 +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
2141 + JSAMPARRAY input_data, JSAMPARRAY output_data)
2142 +{
2143 +}
2144 +
2145 +GLOBAL(void)
2146 +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
2147 + JSAMPARRAY input_data, JSAMPARRAY output_data)
2148 +{
2149 +}
2150 +
2151 +GLOBAL(int)
2152 +jsimd_can_h2v2_upsample (void)
2153 +{
2154 + init_simd();
2155 +
2156 + return 0;
2157 +}
2158 +
2159 +GLOBAL(int)
2160 +jsimd_can_h2v1_upsample (void)
2161 +{
2162 + init_simd();
2163 +
2164 + return 0;
2165 +}
2166 +
2167 +GLOBAL(void)
2168 +jsimd_h2v2_upsample (j_decompress_ptr cinfo,
2169 + jpeg_component_info * compptr,
2170 + JSAMPARRAY input_data,
2171 + JSAMPARRAY * output_data_ptr)
2172 +{
2173 +}
2174 +
2175 +GLOBAL(void)
2176 +jsimd_h2v1_upsample (j_decompress_ptr cinfo,
2177 + jpeg_component_info * compptr,
2178 + JSAMPARRAY input_data,
2179 + JSAMPARRAY * output_data_ptr)
2180 +{
2181 +}
2182 +
2183 +GLOBAL(int)
2184 +jsimd_can_h2v2_fancy_upsample (void)
2185 +{
2186 + init_simd();
2187 +
2188 + return 0;
2189 +}
2190 +
2191 +GLOBAL(int)
2192 +jsimd_can_h2v1_fancy_upsample (void)
2193 +{
2194 + init_simd();
2195 +
2196 + return 0;
2197 +}
2198 +
2199 +GLOBAL(void)
2200 +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
2201 + jpeg_component_info * compptr,
2202 + JSAMPARRAY input_data,
2203 + JSAMPARRAY * output_data_ptr)
2204 +{
2205 +}
2206 +
2207 +GLOBAL(void)
2208 +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
2209 + jpeg_component_info * compptr,
2210 + JSAMPARRAY input_data,
2211 + JSAMPARRAY * output_data_ptr)
2212 +{
2213 +}
2214 +
2215 +GLOBAL(int)
2216 +jsimd_can_h2v2_merged_upsample (void)
2217 +{
2218 + init_simd();
2219 +
2220 + return 0;
2221 +}
2222 +
2223 +GLOBAL(int)
2224 +jsimd_can_h2v1_merged_upsample (void)
2225 +{
2226 + init_simd();
2227 +
2228 + return 0;
2229 +}
2230 +
2231 +GLOBAL(void)
2232 +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
2233 + JSAMPIMAGE input_buf,
2234 + JDIMENSION in_row_group_ctr,
2235 + JSAMPARRAY output_buf)
2236 +{
2237 +}
2238 +
2239 +GLOBAL(void)
2240 +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
2241 + JSAMPIMAGE input_buf,
2242 + JDIMENSION in_row_group_ctr,
2243 + JSAMPARRAY output_buf)
2244 +{
2245 +}
2246 +
2247 +GLOBAL(int)
2248 +jsimd_can_convsamp (void)
2249 +{
2250 + init_simd();
2251 +
2252 + return 0;
2253 +}
2254 +
2255 +GLOBAL(int)
2256 +jsimd_can_convsamp_float (void)
2257 +{
2258 + init_simd();
2259 +
2260 + return 0;
2261 +}
2262 +
2263 +GLOBAL(void)
2264 +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
2265 + DCTELEM * workspace)
2266 +{
2267 +}
2268 +
2269 +GLOBAL(void)
2270 +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
2271 + FAST_FLOAT * workspace)
2272 +{
2273 +}
2274 +
2275 +GLOBAL(int)
2276 +jsimd_can_fdct_islow (void)
2277 +{
2278 + init_simd();
2279 +
2280 + return 0;
2281 +}
2282 +
2283 +GLOBAL(int)
2284 +jsimd_can_fdct_ifast (void)
2285 +{
2286 + init_simd();
2287 +
2288 + return 0;
2289 +}
2290 +
2291 +GLOBAL(int)
2292 +jsimd_can_fdct_float (void)
2293 +{
2294 + init_simd();
2295 +
2296 + return 0;
2297 +}
2298 +
2299 +GLOBAL(void)
2300 +jsimd_fdct_islow (DCTELEM * data)
2301 +{
2302 +}
2303 +
2304 +GLOBAL(void)
2305 +jsimd_fdct_ifast (DCTELEM * data)
2306 +{
2307 +}
2308 +
2309 +GLOBAL(void)
2310 +jsimd_fdct_float (FAST_FLOAT * data)
2311 +{
2312 +}
2313 +
2314 +GLOBAL(int)
2315 +jsimd_can_quantize (void)
2316 +{
2317 + init_simd();
2318 +
2319 + return 0;
2320 +}
2321 +
2322 +GLOBAL(int)
2323 +jsimd_can_quantize_float (void)
2324 +{
2325 + init_simd();
2326 +
2327 + return 0;
2328 +}
2329 +
2330 +GLOBAL(void)
2331 +jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
2332 + DCTELEM * workspace)
2333 +{
2334 +}
2335 +
2336 +GLOBAL(void)
2337 +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
2338 + FAST_FLOAT * workspace)
2339 +{
2340 +}
2341 +
2342 +GLOBAL(int)
2343 +jsimd_can_idct_2x2 (void)
2344 +{
2345 + init_simd();
2346 +
2347 + /* The code is optimised for these values only */
2348 + if (DCTSIZE != 8)
2349 + return 0;
2350 + if (sizeof(JCOEF) != 2)
2351 + return 0;
2352 + if (BITS_IN_JSAMPLE != 8)
2353 + return 0;
2354 + if (sizeof(JDIMENSION) != 4)
2355 + return 0;
2356 + if (sizeof(ISLOW_MULT_TYPE) != 2)
2357 + return 0;
2358 +
2359 + if (simd_support & JSIMD_ARM_NEON)
2360 + return 1;
2361 +
2362 + return 0;
2363 +}
2364 +
2365 +GLOBAL(int)
2366 +jsimd_can_idct_4x4 (void)
2367 +{
2368 + init_simd();
2369 +
2370 + /* The code is optimised for these values only */
2371 + if (DCTSIZE != 8)
2372 + return 0;
2373 + if (sizeof(JCOEF) != 2)
2374 + return 0;
2375 + if (BITS_IN_JSAMPLE != 8)
2376 + return 0;
2377 + if (sizeof(JDIMENSION) != 4)
2378 + return 0;
2379 + if (sizeof(ISLOW_MULT_TYPE) != 2)
2380 + return 0;
2381 +
2382 + if (simd_support & JSIMD_ARM_NEON)
2383 + return 1;
2384 +
2385 + return 0;
2386 +}
2387 +
2388 +GLOBAL(void)
2389 +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2390 + JCOEFPTR coef_block, JSAMPARRAY output_buf,
2391 + JDIMENSION output_col)
2392 +{
2393 + if (simd_support & JSIMD_ARM_NEON)
2394 + jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
2395 + output_col);
2396 +}
2397 +
2398 +GLOBAL(void)
2399 +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2400 + JCOEFPTR coef_block, JSAMPARRAY output_buf,
2401 + JDIMENSION output_col)
2402 +{
2403 + if (simd_support & JSIMD_ARM_NEON)
2404 + jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
2405 + output_col);
2406 +}
2407 +
2408 +GLOBAL(int)
2409 +jsimd_can_idct_islow (void)
2410 +{
2411 + init_simd();
2412 +
2413 + /* The code is optimised for these values only */
2414 + if (DCTSIZE != 8)
2415 + return 0;
2416 + if (sizeof(JCOEF) != 2)
2417 + return 0;
2418 + if (BITS_IN_JSAMPLE != 8)
2419 + return 0;
2420 + if (sizeof(JDIMENSION) != 4)
2421 + return 0;
2422 + if (sizeof(ISLOW_MULT_TYPE) != 2)
2423 + return 0;
2424 +
2425 + if (simd_support & JSIMD_ARM_NEON)
2426 + return 1;
2427 +
2428 + return 0;
2429 +}
2430 +
2431 +GLOBAL(int)
2432 +jsimd_can_idct_ifast (void)
2433 +{
2434 + init_simd();
2435 +
2436 + /* The code is optimised for these values only */
2437 + if (DCTSIZE != 8)
2438 + return 0;
2439 + if (sizeof(JCOEF) != 2)
2440 + return 0;
2441 + if (BITS_IN_JSAMPLE != 8)
2442 + return 0;
2443 + if (sizeof(JDIMENSION) != 4)
2444 + return 0;
2445 + if (sizeof(IFAST_MULT_TYPE) != 2)
2446 + return 0;
2447 + if (IFAST_SCALE_BITS != 2)
2448 + return 0;
2449 +
2450 + if (simd_support & JSIMD_ARM_NEON)
2451 + return 1;
2452 +
2453 + return 0;
2454 +}
2455 +
2456 +GLOBAL(int)
2457 +jsimd_can_idct_float (void)
2458 +{
2459 + init_simd();
2460 +
2461 + return 0;
2462 +}
2463 +
2464 +GLOBAL(void)
2465 +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2466 + JCOEFPTR coef_block, JSAMPARRAY output_buf,
2467 + JDIMENSION output_col)
2468 +{
2469 + if (simd_support & JSIMD_ARM_NEON)
2470 + jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
2471 + output_col);
2472 +}
2473 +
2474 +GLOBAL(void)
2475 +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2476 + JCOEFPTR coef_block, JSAMPARRAY output_buf,
2477 + JDIMENSION output_col)
2478 +{
2479 + if (simd_support & JSIMD_ARM_NEON)
2480 + jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
2481 + output_col);
2482 +}
2483 +
2484 +GLOBAL(void)
2485 +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2486 + JCOEFPTR coef_block, JSAMPARRAY output_buf,
2487 + JDIMENSION output_col)
2488 +{
2489 +}
2490 Index: simd/jsimd_arm64_neon.S
2491 new file mode 100644
2492 ===================================================================
2493 --- /dev/null
2494 +++ simd/jsimd_arm64_neon.S
2495 @@ -0,0 +1,1861 @@
2496 +/*
2497 + * ARMv8 NEON optimizations for libjpeg-turbo
2498 + *
2499 + * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
2500 + * All rights reserved.
2501 + * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
2502 + * Copyright (C) 2013-2014, Linaro Limited
2503 + * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
2504 + *
2505 + * This software is provided 'as-is', without any express or implied
2506 + * warranty. In no event will the authors be held liable for any damages
2507 + * arising from the use of this software.
2508 + *
2509 + * Permission is granted to anyone to use this software for any purpose,
2510 + * including commercial applications, and to alter it and redistribute it
2511 + * freely, subject to the following restrictions:
2512 + *
2513 + * 1. The origin of this software must not be misrepresented; you must not
2514 + * claim that you wrote the original software. If you use this software
2515 + * in a product, an acknowledgment in the product documentation would be
2516 + * appreciated but is not required.
2517 + * 2. Altered source versions must be plainly marked as such, and must not be
2518 + * misrepresented as being the original software.
2519 + * 3. This notice may not be removed or altered from any source distribution.
2520 + */
2521 +
2522 +#if defined(__linux__) && defined(__ELF__)
2523 +.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
2524 +#endif
2525 +
2526 +.text
2527 +.arch armv8-a+fp+simd
2528 +
2529 +
2530 +#define RESPECT_STRICT_ALIGNMENT 1
2531 +
2532 +
2533 +/*****************************************************************************/
2534 +
2535 +/* Supplementary macro for setting function attributes */
2536 +.macro asm_function fname
2537 +#ifdef __APPLE__
2538 + .globl _\fname
2539 +_\fname:
2540 +#else
2541 + .global \fname
2542 +#ifdef __ELF__
2543 + .hidden \fname
2544 + .type \fname, %function
2545 +#endif
2546 +\fname:
2547 +#endif
2548 +.endm
2549 +
2550 +/* Transpose elements of single 128 bit registers */
2551 +.macro transpose_single x0,x1,xi,xilen,literal
2552 + ins \xi\xilen[0], \x0\xilen[0]
2553 + ins \x1\xilen[0], \x0\xilen[1]
2554 + trn1 \x0\literal, \x0\literal, \x1\literal
2555 + trn2 \x1\literal, \xi\literal, \x1\literal
2556 +.endm
2557 +
2558 +/* Transpose elements of 2 different registers */
2559 +.macro transpose x0,x1,xi,xilen,literal
2560 + mov \xi\xilen, \x0\xilen
2561 + trn1 \x0\literal, \x0\literal, \x1\literal
2562 + trn2 \x1\literal, \xi\literal, \x1\literal
2563 +.endm
2564 +
2565 +/* Transpose a block of 4x4 coefficients in four 64-bit registers */
2566 +.macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen
2567 + mov \xi\xilen, \x0\xilen
2568 + trn1 \x0\x0len, \x0\x0len, \x2\x2len
2569 + trn2 \x2\x2len, \xi\x0len, \x2\x2len
2570 + mov \xi\xilen, \x1\xilen
2571 + trn1 \x1\x1len, \x1\x1len, \x3\x3len
2572 + trn2 \x3\x3len, \xi\x1len, \x3\x3len
2573 +.endm
2574 +
2575 +.macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen
2576 + mov \xi\xilen, \x0\xilen
2577 + trn1 \x0\x0len, \x0\x0len, \x1\x1len
2578 + trn2 \x1\x2len, \xi\x0len, \x1\x2len
2579 + mov \xi\xilen, \x2\xilen
2580 + trn1 \x2\x2len, \x2\x2len, \x3\x3len
2581 + trn2 \x3\x2len, \xi\x1len, \x3\x3len
2582 +.endm
2583 +
2584 +.macro transpose_4x4 x0, x1, x2, x3,x5
2585 + transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b
2586 + transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b
2587 +.endm
2588 +
2589 +
2590 +#define CENTERJSAMPLE 128
2591 +
2592 +/*****************************************************************************/
2593 +
2594 +/*
2595 + * Perform dequantization and inverse DCT on one block of coefficients.
2596 + *
2597 + * GLOBAL(void)
2598 + * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
2599 + * JSAMPARRAY output_buf, JDIMENSION output_col)
2600 + */
2601 +
2602 +#define FIX_0_298631336 (2446)
2603 +#define FIX_0_390180644 (3196)
2604 +#define FIX_0_541196100 (4433)
2605 +#define FIX_0_765366865 (6270)
2606 +#define FIX_0_899976223 (7373)
2607 +#define FIX_1_175875602 (9633)
2608 +#define FIX_1_501321110 (12299)
2609 +#define FIX_1_847759065 (15137)
2610 +#define FIX_1_961570560 (16069)
2611 +#define FIX_2_053119869 (16819)
2612 +#define FIX_2_562915447 (20995)
2613 +#define FIX_3_072711026 (25172)
2614 +
2615 +#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
2616 +#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
2617 +#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
2618 +#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
2619 +#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
2620 +#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
2621 +#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
2622 +#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
2623 +
2624 +/*
2625 + * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
2626 + * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
2627 + */
2628 +#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
2629 +{ \
2630 + DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
2631 + INT32 q1, q2, q3, q4, q5, q6, q7; \
2632 + INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
2633 + \
2634 + /* 1-D iDCT input data */ \
2635 + row0 = xrow0; \
2636 + row1 = xrow1; \
2637 + row2 = xrow2; \
2638 + row3 = xrow3; \
2639 + row4 = xrow4; \
2640 + row5 = xrow5; \
2641 + row6 = xrow6; \
2642 + row7 = xrow7; \
2643 + \
2644 + q5 = row7 + row3; \
2645 + q4 = row5 + row1; \
2646 + q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
2647 + MULTIPLY(q4, FIX_1_175875602); \
2648 + q7 = MULTIPLY(q5, FIX_1_175875602) + \
2649 + MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
2650 + q2 = MULTIPLY(row2, FIX_0_541196100) + \
2651 + MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
2652 + q4 = q6; \
2653 + q3 = ((INT32) row0 - (INT32) row4) << 13; \
2654 + q6 += MULTIPLY(row5, -FIX_2_562915447) + \
2655 + MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
2656 + /* now we can use q1 (reloadable constants have been used up) */ \
2657 + q1 = q3 + q2; \
2658 + q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
2659 + MULTIPLY(row1, -FIX_0_899976223); \
2660 + q5 = q7; \
2661 + q1 = q1 + q6; \
2662 + q7 += MULTIPLY(row7, -FIX_0_899976223) + \
2663 + MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
2664 + \
2665 + /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
2666 + tmp11_plus_tmp2 = q1; \
2667 + row1 = 0; \
2668 + \
2669 + q1 = q1 - q6; \
2670 + q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
2671 + MULTIPLY(row3, -FIX_2_562915447); \
2672 + q1 = q1 - q6; \
2673 + q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
2674 + MULTIPLY(row6, FIX_0_541196100); \
2675 + q3 = q3 - q2; \
2676 + \
2677 + /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
2678 + tmp11_minus_tmp2 = q1; \
2679 + \
2680 + q1 = ((INT32) row0 + (INT32) row4) << 13; \
2681 + q2 = q1 + q6; \
2682 + q1 = q1 - q6; \
2683 + \
2684 + /* pick up the results */ \
2685 + tmp0 = q4; \
2686 + tmp1 = q5; \
2687 + tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
2688 + tmp3 = q7; \
2689 + tmp10 = q2; \
2690 + tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
2691 + tmp12 = q3; \
2692 + tmp13 = q1; \
2693 +}
2694 +
2695 +#define XFIX_0_899976223 v0.4h[0]
2696 +#define XFIX_0_541196100 v0.4h[1]
2697 +#define XFIX_2_562915447 v0.4h[2]
2698 +#define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3]
2699 +#define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0]
2700 +#define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1]
2701 +#define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2]
2702 +#define XFIX_1_175875602 v1.4h[3]
2703 +#define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0]
2704 +#define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1]
2705 +#define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2]
2706 +#define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3]
2707 +
2708 +.balign 16
2709 +jsimd_idct_islow_neon_consts:
2710 + .short FIX_0_899976223 /* d0[0] */
2711 + .short FIX_0_541196100 /* d0[1] */
2712 + .short FIX_2_562915447 /* d0[2] */
2713 + .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
2714 + .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
2715 + .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
2716 + .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
2717 + .short FIX_1_175875602 /* d1[3] */
2718 + /* reloadable constants */
2719 + .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
2720 + .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
2721 + .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
2722 + .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
2723 +
2724 +asm_function jsimd_idct_islow_neon
2725 +
2726 + DCT_TABLE .req x0
2727 + COEF_BLOCK .req x1
2728 + OUTPUT_BUF .req x2
2729 + OUTPUT_COL .req x3
2730 + TMP1 .req x0
2731 + TMP2 .req x1
2732 + TMP3 .req x2
2733 + TMP4 .req x15
2734 +
2735 + ROW0L .req v16
2736 + ROW0R .req v17
2737 + ROW1L .req v18
2738 + ROW1R .req v19
2739 + ROW2L .req v20
2740 + ROW2R .req v21
2741 + ROW3L .req v22
2742 + ROW3R .req v23
2743 + ROW4L .req v24
2744 + ROW4R .req v25
2745 + ROW5L .req v26
2746 + ROW5R .req v27
2747 + ROW6L .req v28
2748 + ROW6R .req v29
2749 + ROW7L .req v30
2750 + ROW7R .req v31
2751 + /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
2752 + sub sp, sp, 272
2753 + str x15, [sp], 16
2754 + adr x15, jsimd_idct_islow_neon_consts
2755 + st1 {v0.8b - v3.8b}, [sp], 32
2756 + st1 {v4.8b - v7.8b}, [sp], 32
2757 + st1 {v8.8b - v11.8b}, [sp], 32
2758 + st1 {v12.8b - v15.8b}, [sp], 32
2759 + st1 {v16.8b - v19.8b}, [sp], 32
2760 + st1 {v20.8b - v23.8b}, [sp], 32
2761 + st1 {v24.8b - v27.8b}, [sp], 32
2762 + st1 {v28.8b - v31.8b}, [sp], 32
2763 + ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
2764 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
2765 + ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
2766 + mul v16.4h, v16.4h, v0.4h
2767 + mul v17.4h, v17.4h, v1.4h
2768 + ins v16.2d[1], v17.2d[0] /* 128 bit q8 */
2769 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
2770 + mul v18.4h, v18.4h, v2.4h
2771 + mul v19.4h, v19.4h, v3.4h
2772 + ins v18.2d[1], v19.2d[0] /* 128 bit q9 */
2773 + ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
2774 + mul v20.4h, v20.4h, v4.4h
2775 + mul v21.4h, v21.4h, v5.4h
2776 + ins v20.2d[1], v21.2d[0] /* 128 bit q10 */
2777 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
2778 + mul v22.4h, v22.4h, v6.4h
2779 + mul v23.4h, v23.4h, v7.4h
2780 + ins v22.2d[1], v23.2d[0] /* 128 bit q11 */
2781 + ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
2782 + mul v24.4h, v24.4h, v0.4h
2783 + mul v25.4h, v25.4h, v1.4h
2784 + ins v24.2d[1], v25.2d[0] /* 128 bit q12 */
2785 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
2786 + mul v28.4h, v28.4h, v4.4h
2787 + mul v29.4h, v29.4h, v5.4h
2788 + ins v28.2d[1], v29.2d[0] /* 128 bit q14 */
2789 + mul v26.4h, v26.4h, v2.4h
2790 + mul v27.4h, v27.4h, v3.4h
2791 + ins v26.2d[1], v27.2d[0] /* 128 bit q13 */
2792 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */
2793 + add x15, x15, #16
2794 + mul v30.4h, v30.4h, v6.4h
2795 + mul v31.4h, v31.4h, v7.4h
2796 + ins v30.2d[1], v31.2d[0] /* 128 bit q15 */
2797 + /* Go to the bottom of the stack */
2798 + sub sp, sp, 352
2799 + stp x4, x5, [sp], 16
2800 + st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */
2801 + st1 {v12.4h - v15.4h}, [sp], 32
2802 + /* 1-D IDCT, pass 1, left 4x8 half */
2803 + add v4.4h, ROW7L.4h, ROW3L.4h
2804 + add v5.4h, ROW5L.4h, ROW1L.4h
2805 + smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560
2806 + smlal v12.4s, v5.4h, XFIX_1_175875602
2807 + smull v14.4s, v4.4h, XFIX_1_175875602
2808 + /* Check for the zero coefficients in the right 4x8 half */
2809 + smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644
2810 + ssubl v6.4s, ROW0L.4h, ROW4L.4h
2811 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
2812 + smull v4.4s, ROW2L.4h, XFIX_0_541196100
2813 + smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
2814 + orr x0, x4, x5
2815 + mov v8.16b, v12.16b
2816 + smlsl v12.4s, ROW5L.4h, XFIX_2_562915447
2817 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
2818 + smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
2819 + shl v6.4s, v6.4s, #13
2820 + orr x0, x0, x4
2821 + smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
2822 + orr x0, x0 , x5
2823 + add v2.4s, v6.4s, v4.4s
2824 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
2825 + mov v10.16b, v14.16b
2826 + add v2.4s, v2.4s, v12.4s
2827 + orr x0, x0, x4
2828 + smlsl v14.4s, ROW7L.4h, XFIX_0_899976223
2829 + orr x0, x0, x5
2830 + smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
2831 + rshrn ROW1L.4h, v2.4s, #11
2832 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
2833 + sub v2.4s, v2.4s, v12.4s
2834 + smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
2835 + orr x0, x0, x4
2836 + smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
2837 + orr x0, x0, x5
2838 + sub v2.4s, v2.4s, v12.4s
2839 + smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
2840 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
2841 + smlal v12.4s, ROW6L.4h, XFIX_0_541196100
2842 + sub v6.4s, v6.4s, v4.4s
2843 + orr x0, x0, x4
2844 + rshrn ROW6L.4h, v2.4s, #11
2845 + orr x0, x0, x5
2846 + add v2.4s, v6.4s, v10.4s
2847 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
2848 + sub v6.4s, v6.4s, v10.4s
2849 + saddl v10.4s, ROW0L.4h, ROW4L.4h
2850 + orr x0, x0, x4
2851 + rshrn ROW2L.4h, v2.4s, #11
2852 + orr x0, x0, x5
2853 + rshrn ROW5L.4h, v6.4s, #11
2854 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
2855 + shl v10.4s, v10.4s, #13
2856 + smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
2857 + orr x0, x0, x4
2858 + add v4.4s, v10.4s, v12.4s
2859 + orr x0, x0, x5
2860 + cmp x0, #0 /* orrs instruction removed */
2861 + sub v2.4s, v10.4s, v12.4s
2862 + add v12.4s, v4.4s, v14.4s
2863 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
2864 + sub v4.4s, v4.4s, v14.4s
2865 + add v10.4s, v2.4s, v8.4s
2866 + orr x0, x4, x5
2867 + sub v6.4s, v2.4s, v8.4s
2868 + /* pop {x4, x5} */
2869 + sub sp, sp, 80
2870 + ldp x4, x5, [sp], 16
2871 + rshrn ROW7L.4h, v4.4s, #11
2872 + rshrn ROW3L.4h, v10.4s, #11
2873 + rshrn ROW0L.4h, v12.4s, #11
2874 + rshrn ROW4L.4h, v6.4s, #11
2875 +
2876 + beq 3f /* Go to do some special handling for the sparse right 4x8 half */
2877 +
2878 + /* 1-D IDCT, pass 1, right 4x8 half */
2879 + ld1 {v2.4h}, [x15] /* reload constants */
2880 + add v10.4h, ROW7R.4h, ROW3R.4h
2881 + add v8.4h, ROW5R.4h, ROW1R.4h
2882 + /* Transpose ROW6L <-> ROW7L (v3 available free register) */
2883 + transpose ROW6L, ROW7L, v3, .16b, .4h
2884 + smull v12.4s, v10.4h, XFIX_1_175875602_MINUS_1_961570560
2885 + smlal v12.4s, v8.4h, XFIX_1_175875602
2886 + /* Transpose ROW2L <-> ROW3L (v3 available free register) */
2887 + transpose ROW2L, ROW3L, v3, .16b, .4h
2888 + smull v14.4s, v10.4h, XFIX_1_175875602
2889 + smlal v14.4s, v8.4h, XFIX_1_175875602_MINUS_0_390180644
2890 + /* Transpose ROW0L <-> ROW1L (v3 available free register) */
2891 + transpose ROW0L, ROW1L, v3, .16b, .4h
2892 + ssubl v6.4s, ROW0R.4h, ROW4R.4h
2893 + smull v4.4s, ROW2R.4h, XFIX_0_541196100
2894 + smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
2895 + /* Transpose ROW4L <-> ROW5L (v3 available free register) */
2896 + transpose ROW4L, ROW5L, v3, .16b, .4h
2897 + mov v8.16b, v12.16b
2898 + smlsl v12.4s, ROW5R.4h, XFIX_2_562915447
2899 + smlal v12.4s, ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447
2900 + /* Transpose ROW1L <-> ROW3L (v3 available free register) */
2901 + transpose ROW1L, ROW3L, v3, .16b, .2s
2902 + shl v6.4s, v6.4s, #13
2903 + smlsl v8.4s, ROW1R.4h, XFIX_0_899976223
2904 + /* Transpose ROW4L <-> ROW6L (v3 available free register) */
2905 + transpose ROW4L, ROW6L, v3, .16b, .2s
2906 + add v2.4s, v6.4s, v4.4s
2907 + mov v10.16b, v14.16b
2908 + add v2.4s, v2.4s, v12.4s
2909 + /* Transpose ROW0L <-> ROW2L (v3 available free register) */
2910 + transpose ROW0L, ROW2L, v3, .16b, .2s
2911 + smlsl v14.4s, ROW7R.4h, XFIX_0_899976223
2912 + smlal v14.4s, ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223
2913 + rshrn ROW1R.4h, v2.4s, #11
2914 + /* Transpose ROW5L <-> ROW7L (v3 available free register) */
2915 + transpose ROW5L, ROW7L, v3, .16b, .2s
2916 + sub v2.4s, v2.4s, v12.4s
2917 + smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
2918 + smlsl v10.4s, ROW3R.4h, XFIX_2_562915447
2919 + sub v2.4s, v2.4s, v12.4s
2920 + smull v12.4s, ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865
2921 + smlal v12.4s, ROW6R.4h, XFIX_0_541196100
2922 + sub v6.4s, v6.4s, v4.4s
2923 + rshrn ROW6R.4h, v2.4s, #11
2924 + add v2.4s, v6.4s, v10.4s
2925 + sub v6.4s, v6.4s, v10.4s
2926 + saddl v10.4s, ROW0R.4h, ROW4R.4h
2927 + rshrn ROW2R.4h, v2.4s, #11
2928 + rshrn ROW5R.4h, v6.4s, #11
2929 + shl v10.4s, v10.4s, #13
2930 + smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
2931 + add v4.4s, v10.4s, v12.4s
2932 + sub v2.4s, v10.4s, v12.4s
2933 + add v12.4s, v4.4s, v14.4s
2934 + sub v4.4s, v4.4s, v14.4s
2935 + add v10.4s, v2.4s, v8.4s
2936 + sub v6.4s, v2.4s, v8.4s
2937 + rshrn ROW7R.4h, v4.4s, #11
2938 + rshrn ROW3R.4h, v10.4s, #11
2939 + rshrn ROW0R.4h, v12.4s, #11
2940 + rshrn ROW4R.4h, v6.4s, #11
2941 + /* Transpose right 4x8 half */
2942 + transpose ROW6R, ROW7R, v3, .16b, .4h
2943 + transpose ROW2R, ROW3R, v3, .16b, .4h
2944 + transpose ROW0R, ROW1R, v3, .16b, .4h
2945 + transpose ROW4R, ROW5R, v3, .16b, .4h
2946 + transpose ROW1R, ROW3R, v3, .16b, .2s
2947 + transpose ROW4R, ROW6R, v3, .16b, .2s
2948 + transpose ROW0R, ROW2R, v3, .16b, .2s
2949 + transpose ROW5R, ROW7R, v3, .16b, .2s
2950 +
2951 +1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
2952 + ld1 {v2.4h}, [x15] /* reload constants */
2953 + smull v12.4S, ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R. 4h */
2954 + smlal v12.4s, ROW1L.4h, XFIX_1_175875602
2955 + smlal v12.4s, ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* R OW7L.4h <-> ROW3R.4h */
2956 + smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
2957 + smull v14.4s, ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R. 4h */
2958 + smlal v14.4s, ROW3L.4h, XFIX_1_175875602
2959 + smlal v14.4s, ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* R OW5L.4h <-> ROW1R.4h */
2960 + smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
2961 + ssubl v6.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
2962 + smull v4.4s, ROW2L.4h, XFIX_0_541196100
2963 + smlal v4.4s, ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* R OW6L.4h <-> ROW2R.4h */
2964 + mov v8.16b, v12.16b
2965 + smlsl v12.4s, ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R. 4h */
2966 + smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
2967 + shl v6.4s, v6.4s, #13
2968 + smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
2969 + add v2.4s, v6.4s, v4.4s
2970 + mov v10.16b, v14.16b
2971 + add v2.4s, v2.4s, v12.4s
2972 + smlsl v14.4s, ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R. 4h */
2973 + smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
2974 + shrn ROW1L.4h, v2.4s, #16
2975 + sub v2.4s, v2.4s, v12.4s
2976 + smlal v10.4s, ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* R OW5L.4h <-> ROW1R.4h */
2977 + smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
2978 + sub v2.4s, v2.4s, v12.4s
2979 + smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
2980 + smlal v12.4s, ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R. 4h */
2981 + sub v6.4s, v6.4s, v4.4s
2982 + shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
2983 + add v2.4s, v6.4s, v10.4s
2984 + sub v6.4s, v6.4s, v10.4s
2985 + saddl v10.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
2986 + shrn ROW2L.4h, v2.4s, #16
2987 + shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
2988 + shl v10.4s, v10.4s, #13
2989 + smlal v8.4s, ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* R OW7L.4h <-> ROW3R.4h */
2990 + add v4.4s, v10.4s, v12.4s
2991 + sub v2.4s, v10.4s, v12.4s
2992 + add v12.4s, v4.4s, v14.4s
2993 + sub v4.4s, v4.4s, v14.4s
2994 + add v10.4s, v2.4s, v8.4s
2995 + sub v6.4s, v2.4s, v8.4s
2996 + shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
2997 + shrn ROW3L.4h, v10.4s, #16
2998 + shrn ROW0L.4h, v12.4s, #16
2999 + shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
3000 + /* 1-D IDCT, pass 2, right 4x8 half */
3001 + ld1 {v2.4h}, [x15] /* reload constants */
3002 + smull v12.4s, ROW5R.4h, XFIX_1_175875602
3003 + smlal v12.4s, ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R. 4h */
3004 + smlal v12.4s, ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560
3005 + smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* R OW7L.4h <-> ROW3R.4h */
3006 + smull v14.4s, ROW7R.4h, XFIX_1_175875602
3007 + smlal v14.4s, ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R. 4h */
3008 + smlal v14.4s, ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644
3009 + smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* R OW5L.4h <-> ROW1R.4h */
3010 + ssubl v6.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
3011 + smull v4.4s, ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R. 4h */
3012 + smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
3013 + mov v8.16b, v12.16b
3014 + smlsl v12.4s, ROW5R.4h, XFIX_2_562915447
3015 + smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* R OW7L.4h <-> ROW3R.4h */
3016 + shl v6.4s, v6.4s, #13
3017 + smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R. 4h */
3018 + add v2.4s, v6.4s, v4.4s
3019 + mov v10.16b, v14.16b
3020 + add v2.4s, v2.4s, v12.4s
3021 + smlsl v14.4s, ROW7R.4h, XFIX_0_899976223
3022 + smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* R OW5L.4h <-> ROW1R.4h */
3023 + shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
3024 + sub v2.4s, v2.4s, v12.4s
3025 + smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
3026 + smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R. 4h */
3027 + sub v2.4s, v2.4s, v12.4s
3028 + smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* RO W6L.4h <-> ROW2R.4h */
3029 + smlal v12.4s, ROW6R.4h, XFIX_0_541196100
3030 + sub v6.4s, v6.4s, v4.4s
3031 + shrn ROW6R.4h, v2.4s, #16
3032 + add v2.4s, v6.4s, v10.4s
3033 + sub v6.4s, v6.4s, v10.4s
3034 + saddl v10.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
3035 + shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
3036 + shrn ROW5R.4h, v6.4s, #16
3037 + shl v10.4s, v10.4s, #13
3038 + smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
3039 + add v4.4s, v10.4s, v12.4s
3040 + sub v2.4s, v10.4s, v12.4s
3041 + add v12.4s, v4.4s, v14.4s
3042 + sub v4.4s, v4.4s, v14.4s
3043 + add v10.4s, v2.4s, v8.4s
3044 + sub v6.4s, v2.4s, v8.4s
3045 + shrn ROW7R.4h, v4.4s, #16
3046 + shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
3047 + shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
3048 + shrn ROW4R.4h, v6.4s, #16
3049 +
3050 +2: /* Descale to 8-bit and range limit */
3051 + ins v16.2d[1], v17.2d[0]
3052 + ins v18.2d[1], v19.2d[0]
3053 + ins v20.2d[1], v21.2d[0]
3054 + ins v22.2d[1], v23.2d[0]
3055 + sqrshrn v16.8b, v16.8h, #2
3056 + sqrshrn2 v16.16b, v18.8h, #2
3057 + sqrshrn v18.8b, v20.8h, #2
3058 + sqrshrn2 v18.16b, v22.8h, #2
3059 +
3060 + /* vpop {v8.4h - d15.4h} */ /* restore NEON registers */
3061 + ld1 {v8.4h - v11.4h}, [sp], 32
3062 + ld1 {v12.4h - v15.4h}, [sp], 32
3063 + ins v24.2d[1], v25.2d[0]
3064 +
3065 + sqrshrn v20.8b, v24.8h, #2
3066 + /* Transpose the final 8-bit samples and do signed->unsigned conversion * /
3067 + /* trn1 v16.8h, v16.8h, v18.8h */
3068 + transpose v16, v18, v3, .16b, .8h
3069 + ins v26.2d[1], v27.2d[0]
3070 + ins v28.2d[1], v29.2d[0]
3071 + ins v30.2d[1], v31.2d[0]
3072 + sqrshrn2 v20.16b, v26.8h, #2
3073 + sqrshrn v22.8b, v28.8h, #2
3074 + movi v0.16b, #(CENTERJSAMPLE)
3075 + sqrshrn2 v22.16b, v30.8h, #2
3076 + transpose_single v16, v17, v3, .2d, .8b
3077 + transpose_single v18, v19, v3, .2d, .8b
3078 + add v16.8b, v16.8b, v0.8b
3079 + add v17.8b, v17.8b, v0.8b
3080 + add v18.8b, v18.8b, v0.8b
3081 + add v19.8b, v19.8b, v0.8b
3082 + transpose v20, v22, v3, .16b, .8h
3083 + /* Store results to the output buffer */
3084 + ldp TMP1, TMP2, [OUTPUT_BUF], 16
3085 + add TMP1, TMP1, OUTPUT_COL
3086 + add TMP2, TMP2, OUTPUT_COL
3087 + st1 {v16.8b}, [TMP1]
3088 + transpose_single v20, v21, v3, .2d, .8b
3089 + st1 {v17.8b}, [TMP2]
3090 + ldp TMP1, TMP2, [OUTPUT_BUF], 16
3091 + add TMP1, TMP1, OUTPUT_COL
3092 + add TMP2, TMP2, OUTPUT_COL
3093 + st1 {v18.8b}, [TMP1]
3094 + add v20.8b, v20.8b, v0.8b
3095 + add v21.8b, v21.8b, v0.8b
3096 + st1 {v19.8b}, [TMP2]
3097 + ldp TMP1, TMP2, [OUTPUT_BUF], 16
3098 + ldp TMP3, TMP4, [OUTPUT_BUF]
3099 + add TMP1, TMP1, OUTPUT_COL
3100 + add TMP2, TMP2, OUTPUT_COL
3101 + add TMP3, TMP3, OUTPUT_COL
3102 + add TMP4, TMP4, OUTPUT_COL
3103 + transpose_single v22, v23, v3, .2d, .8b
3104 + st1 {v20.8b}, [TMP1]
3105 + add v22.8b, v22.8b, v0.8b
3106 + add v23.8b, v23.8b, v0.8b
3107 + st1 {v21.8b}, [TMP2]
3108 + st1 {v22.8b}, [TMP3]
3109 + st1 {v23.8b}, [TMP4]
3110 + ldr x15, [sp], 16
3111 + ld1 {v0.8b - v3.8b}, [sp], 32
3112 + ld1 {v4.8b - v7.8b}, [sp], 32
3113 + ld1 {v8.8b - v11.8b}, [sp], 32
3114 + ld1 {v12.8b - v15.8b}, [sp], 32
3115 + ld1 {v16.8b - v19.8b}, [sp], 32
3116 + ld1 {v20.8b - v23.8b}, [sp], 32
3117 + ld1 {v24.8b - v27.8b}, [sp], 32
3118 + ld1 {v28.8b - v31.8b}, [sp], 32
3119 + blr x30
3120 +
3121 +3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
3122 +
3123 + /* Transpose left 4x8 half */
3124 + transpose ROW6L, ROW7L, v3, .16b, .4h
3125 + transpose ROW2L, ROW3L, v3, .16b, .4h
3126 + transpose ROW0L, ROW1L, v3, .16b, .4h
3127 + transpose ROW4L, ROW5L, v3, .16b, .4h
3128 + shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */
3129 + transpose ROW1L, ROW3L, v3, .16b, .2s
3130 + transpose ROW4L, ROW6L, v3, .16b, .2s
3131 + transpose ROW0L, ROW2L, v3, .16b, .2s
3132 + transpose ROW5L, ROW7L, v3, .16b, .2s
3133 + cmp x0, #0
3134 + beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second p ass */
3135 +
3136 + /* Only row 0 is non-zero for the right 4x8 half */
3137 + dup ROW1R.4h, ROW0R.4h[1]
3138 + dup ROW2R.4h, ROW0R.4h[2]
3139 + dup ROW3R.4h, ROW0R.4h[3]
3140 + dup ROW4R.4h, ROW0R.4h[0]
3141 + dup ROW5R.4h, ROW0R.4h[1]
3142 + dup ROW6R.4h, ROW0R.4h[2]
3143 + dup ROW7R.4h, ROW0R.4h[3]
3144 + dup ROW0R.4h, ROW0R.4h[0]
3145 + b 1b /* Go to 'normal' second pass */
3146 +
3147 +4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
3148 + ld1 {v2.4h}, [x15] /* reload constants */
3149 + smull v12.4s, ROW1L.4h, XFIX_1_175875602
3150 + smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
3151 + smull v14.4s, ROW3L.4h, XFIX_1_175875602
3152 + smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
3153 + smull v4.4s, ROW2L.4h, XFIX_0_541196100
3154 + sshll v6.4s, ROW0L.4h, #13
3155 + mov v8.16b, v12.16b
3156 + smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
3157 + smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
3158 + add v2.4s, v6.4s, v4.4s
3159 + mov v10.16b, v14.16b
3160 + smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
3161 + add v2.4s, v2.4s, v12.4s
3162 + add v12.4s, v12.4s, v12.4s
3163 + smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
3164 + shrn ROW1L.4h, v2.4s, #16
3165 + sub v2.4s, v2.4s, v12.4s
3166 + smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
3167 + sub v6.4s, v6.4s, v4.4s
3168 + shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
3169 + add v2.4s, v6.4s, v10.4s
3170 + sub v6.4s, v6.4s, v10.4s
3171 + sshll v10.4s, ROW0L.4h, #13
3172 + shrn ROW2L.4h, v2.4s, #16
3173 + shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
3174 + add v4.4s, v10.4s, v12.4s
3175 + sub v2.4s, v10.4s, v12.4s
3176 + add v12.4s, v4.4s, v14.4s
3177 + sub v4.4s, v4.4s, v14.4s
3178 + add v10.4s, v2.4s, v8.4s
3179 + sub v6.4s, v2.4s, v8.4s
3180 + shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
3181 + shrn ROW3L.4h, v10.4s, #16
3182 + shrn ROW0L.4h, v12.4s, #16
3183 + shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
3184 + /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
3185 + ld1 {v2.4h}, [x15] /* reload constants */
3186 + smull v12.4s, ROW5L.4h, XFIX_1_175875602
3187 + smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560
3188 + smull v14.4s, ROW7L.4h, XFIX_1_175875602
3189 + smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644
3190 + smull v4.4s, ROW6L.4h, XFIX_0_541196100
3191 + sshll v6.4s, ROW4L.4h, #13
3192 + mov v8.16b, v12.16b
3193 + smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447
3194 + smlsl v8.4s, ROW5L.4h, XFIX_0_899976223
3195 + add v2.4s, v6.4s, v4.4s
3196 + mov v10.16b, v14.16b
3197 + smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223
3198 + add v2.4s, v2.4s, v12.4s
3199 + add v12.4s, v12.4s, v12.4s
3200 + smlsl v10.4s, ROW7L.4h, XFIX_2_562915447
3201 + shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */
3202 + sub v2.4s, v2.4s, v12.4s
3203 + smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865
3204 + sub v6.4s, v6.4s, v4.4s
3205 + shrn ROW6R.4h, v2.4s, #16
3206 + add v2.4s, v6.4s, v10.4s
3207 + sub v6.4s, v6.4s, v10.4s
3208 + sshll v10.4s, ROW4L.4h, #13
3209 + shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */
3210 + shrn ROW5R.4h, v6.4s, #16
3211 + add v4.4s, v10.4s, v12.4s
3212 + sub v2.4s, v10.4s, v12.4s
3213 + add v12.4s, v4.4s, v14.4s
3214 + sub v4.4s, v4.4s, v14.4s
3215 + add v10.4s, v2.4s, v8.4s
3216 + sub v6.4s, v2.4s, v8.4s
3217 + shrn ROW7R.4h, v4.4s, #16
3218 + shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */
3219 + shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */
3220 + shrn ROW4R.4h, v6.4s, #16
3221 + b 2b /* Go to epilogue */
3222 +
3223 + .unreq DCT_TABLE
3224 + .unreq COEF_BLOCK
3225 + .unreq OUTPUT_BUF
3226 + .unreq OUTPUT_COL
3227 + .unreq TMP1
3228 + .unreq TMP2
3229 + .unreq TMP3
3230 + .unreq TMP4
3231 +
3232 + .unreq ROW0L
3233 + .unreq ROW0R
3234 + .unreq ROW1L
3235 + .unreq ROW1R
3236 + .unreq ROW2L
3237 + .unreq ROW2R
3238 + .unreq ROW3L
3239 + .unreq ROW3R
3240 + .unreq ROW4L
3241 + .unreq ROW4R
3242 + .unreq ROW5L
3243 + .unreq ROW5R
3244 + .unreq ROW6L
3245 + .unreq ROW6R
3246 + .unreq ROW7L
3247 + .unreq ROW7R
3248 +
3249 +
3250 +/*****************************************************************************/
3251 +
3252 +/*
3253 + * jsimd_idct_ifast_neon
3254 + *
3255 + * This function contains a fast, not so accurate integer implementation of
3256 + * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
3257 + * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
3258 + * function from jidctfst.c
3259 + *
3260 + * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
3261 + * But in ARM NEON case some extra additions are required because VQDMULH
3262 + * instruction can't handle the constants larger than 1. So the expressions
3263 + * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
3264 + * which introduces an extra addition. Overall, there are 6 extra additions
3265 + * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
3266 + */
3267 +
3268 +#define XFIX_1_082392200 v0.4h[0]
3269 +#define XFIX_1_414213562 v0.4h[1]
3270 +#define XFIX_1_847759065 v0.4h[2]
3271 +#define XFIX_2_613125930 v0.4h[3]
3272 +
3273 +.balign 16
3274 +jsimd_idct_ifast_neon_consts:
3275 + .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
3276 + .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
3277 + .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
3278 + .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
3279 +
3280 +asm_function jsimd_idct_ifast_neon
3281 +
3282 + DCT_TABLE .req x0
3283 + COEF_BLOCK .req x1
3284 + OUTPUT_BUF .req x2
3285 + OUTPUT_COL .req x3
3286 + TMP1 .req x0
3287 + TMP2 .req x1
3288 + TMP3 .req x2
3289 + TMP4 .req x22
3290 + TMP5 .req x23
3291 +
3292 + /* Load and dequantize coefficients into NEON registers
3293 + * with the following allocation:
3294 + * 0 1 2 3 | 4 5 6 7
3295 + * ---------+--------
3296 + * 0 | d16 | d17 ( v8.8h )
3297 + * 1 | d18 | d19 ( v9.8h )
3298 + * 2 | d20 | d21 ( v10.8h )
3299 + * 3 | d22 | d23 ( v11.8h )
3300 + * 4 | d24 | d25 ( v12.8h )
3301 + * 5 | d26 | d27 ( v13.8h )
3302 + * 6 | d28 | d29 ( v14.8h )
3303 + * 7 | d30 | d31 ( v15.8h )
3304 + */
3305 + /* Save NEON registers used in fast IDCT */
3306 + sub sp, sp, #176
3307 + stp x22, x23, [sp], 16
3308 + adr x23, jsimd_idct_ifast_neon_consts
3309 + st1 {v0.8b - v3.8b}, [sp], 32
3310 + st1 {v4.8b - v7.8b}, [sp], 32
3311 + st1 {v8.8b - v11.8b}, [sp], 32
3312 + st1 {v12.8b - v15.8b}, [sp], 32
3313 + st1 {v16.8b - v19.8b}, [sp], 32
3314 + ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32
3315 + ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
3316 + ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32
3317 + mul v8.8h, v8.8h, v0.8h
3318 + ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
3319 + mul v9.8h, v9.8h, v1.8h
3320 + ld1 {v12.8h, v13.8h}, [COEF_BLOCK], 32
3321 + mul v10.8h, v10.8h, v2.8h
3322 + ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
3323 + mul v11.8h, v11.8h, v3.8h
3324 + ld1 {v14.8h, v15.8h}, [COEF_BLOCK], 32
3325 + mul v12.8h, v12.8h, v0.8h
3326 + ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
3327 + mul v14.8h, v14.8h, v2.8h
3328 + mul v13.8h, v13.8h, v1.8h
3329 + ld1 {v0.4h}, [x23] /* load constants */
3330 + mul v15.8h, v15.8h, v3.8h
3331 +
3332 + /* 1-D IDCT, pass 1 */
3333 + sub v2.8h, v10.8h, v14.8h
3334 + add v14.8h, v10.8h, v14.8h
3335 + sub v1.8h, v11.8h, v13.8h
3336 + add v13.8h, v11.8h, v13.8h
3337 + sub v5.8h, v9.8h, v15.8h
3338 + add v15.8h, v9.8h, v15.8h
3339 + sqdmulh v4.8h, v2.8h, XFIX_1_414213562
3340 + sqdmulh v6.8h, v1.8h, XFIX_2_613125930
3341 + add v3.8h, v1.8h, v1.8h
3342 + sub v1.8h, v5.8h, v1.8h
3343 + add v10.8h, v2.8h, v4.8h
3344 + sqdmulh v4.8h, v1.8h, XFIX_1_847759065
3345 + sub v2.8h, v15.8h, v13.8h
3346 + add v3.8h, v3.8h, v6.8h
3347 + sqdmulh v6.8h, v2.8h, XFIX_1_414213562
3348 + add v1.8h, v1.8h, v4.8h
3349 + sqdmulh v4.8h, v5.8h, XFIX_1_082392200
3350 + sub v10.8h, v10.8h, v14.8h
3351 + add v2.8h, v2.8h, v6.8h
3352 + sub v6.8h, v8.8h, v12.8h
3353 + add v12.8h, v8.8h, v12.8h
3354 + add v9.8h, v5.8h, v4.8h
3355 + add v5.8h, v6.8h, v10.8h
3356 + sub v10.8h, v6.8h, v10.8h
3357 + add v6.8h, v15.8h, v13.8h
3358 + add v8.8h, v12.8h, v14.8h
3359 + sub v3.8h, v6.8h, v3.8h
3360 + sub v12.8h, v12.8h, v14.8h
3361 + sub v3.8h, v3.8h, v1.8h
3362 + sub v1.8h, v9.8h, v1.8h
3363 + add v2.8h, v3.8h, v2.8h
3364 + sub v15.8h, v8.8h, v6.8h
3365 + add v1.8h, v1.8h, v2.8h
3366 + add v8.8h, v8.8h, v6.8h
3367 + add v14.8h, v5.8h, v3.8h
3368 + sub v9.8h, v5.8h, v3.8h
3369 + sub v13.8h, v10.8h, v2.8h
3370 + add v10.8h, v10.8h, v2.8h
3371 + /* Transpose q8-q9 */
3372 + mov v18.16b, v8.16b
3373 + trn1 v8.8h, v8.8h, v9.8h
3374 + trn2 v9.8h, v18.8h, v9.8h
3375 + sub v11.8h, v12.8h, v1.8h
3376 + /* Transpose q14-q15 */
3377 + mov v18.16b, v14.16b
3378 + trn1 v14.8h, v14.8h, v15.8h
3379 + trn2 v15.8h, v18.8h, v15.8h
3380 + add v12.8h, v12.8h, v1.8h
3381 + /* Transpose q10-q11 */
3382 + mov v18.16b, v10.16b
3383 + trn1 v10.8h, v10.8h, v11.8h
3384 + trn2 v11.8h, v18.8h, v11.8h
3385 + /* Transpose q12-q13 */
3386 + mov v18.16b, v12.16b
3387 + trn1 v12.8h, v12.8h, v13.8h
3388 + trn2 v13.8h, v18.8h, v13.8h
3389 + /* Transpose q9-q11 */
3390 + mov v18.16b, v9.16b
3391 + trn1 v9.4s, v9.4s, v11.4s
3392 + trn2 v11.4s, v18.4s, v11.4s
3393 + /* Transpose q12-q14 */
3394 + mov v18.16b, v12.16b
3395 + trn1 v12.4s, v12.4s, v14.4s
3396 + trn2 v14.4s, v18.4s, v14.4s
3397 + /* Transpose q8-q10 */
3398 + mov v18.16b, v8.16b
3399 + trn1 v8.4s, v8.4s, v10.4s
3400 + trn2 v10.4s, v18.4s, v10.4s
3401 + /* Transpose q13-q15 */
3402 + mov v18.16b, v13.16b
3403 + trn1 v13.4s, v13.4s, v15.4s
3404 + trn2 v15.4s, v18.4s, v15.4s
3405 + /* vswp v14.4h, v10-MSB.4h */
3406 + umov x22, v14.d[0]
3407 + ins v14.2d[0], v10.2d[1]
3408 + ins v10.2d[1], x22
3409 + /* vswp v13.4h, v9MSB.4h */
3410 +
3411 + umov x22, v13.d[0]
3412 + ins v13.2d[0], v9.2d[1]
3413 + ins v9.2d[1], x22
3414 + /* 1-D IDCT, pass 2 */
3415 + sub v2.8h, v10.8h, v14.8h
3416 + /* vswp v15.4h, v11MSB.4h */
3417 + umov x22, v15.d[0]
3418 + ins v15.2d[0], v11.2d[1]
3419 + ins v11.2d[1], x22
3420 + add v14.8h, v10.8h, v14.8h
3421 + /* vswp v12.4h, v8-MSB.4h */
3422 + umov x22, v12.d[0]
3423 + ins v12.2d[0], v8.2d[1]
3424 + ins v8.2d[1], x22
3425 + sub v1.8h, v11.8h, v13.8h
3426 + add v13.8h, v11.8h, v13.8h
3427 + sub v5.8h, v9.8h, v15.8h
3428 + add v15.8h, v9.8h, v15.8h
3429 + sqdmulh v4.8h, v2.8h, XFIX_1_414213562
3430 + sqdmulh v6.8h, v1.8h, XFIX_2_613125930
3431 + add v3.8h, v1.8h, v1.8h
3432 + sub v1.8h, v5.8h, v1.8h
3433 + add v10.8h, v2.8h, v4.8h
3434 + sqdmulh v4.8h, v1.8h, XFIX_1_847759065
3435 + sub v2.8h, v15.8h, v13.8h
3436 + add v3.8h, v3.8h, v6.8h
3437 + sqdmulh v6.8h, v2.8h, XFIX_1_414213562
3438 + add v1.8h, v1.8h, v4.8h
3439 + sqdmulh v4.8h, v5.8h, XFIX_1_082392200
3440 + sub v10.8h, v10.8h, v14.8h
3441 + add v2.8h, v2.8h, v6.8h
3442 + sub v6.8h, v8.8h, v12.8h
3443 + add v12.8h, v8.8h, v12.8h
3444 + add v9.8h, v5.8h, v4.8h
3445 + add v5.8h, v6.8h, v10.8h
3446 + sub v10.8h, v6.8h, v10.8h
3447 + add v6.8h, v15.8h, v13.8h
3448 + add v8.8h, v12.8h, v14.8h
3449 + sub v3.8h, v6.8h, v3.8h
3450 + sub v12.8h, v12.8h, v14.8h
3451 + sub v3.8h, v3.8h, v1.8h
3452 + sub v1.8h, v9.8h, v1.8h
3453 + add v2.8h, v3.8h, v2.8h
3454 + sub v15.8h, v8.8h, v6.8h
3455 + add v1.8h, v1.8h, v2.8h
3456 + add v8.8h, v8.8h, v6.8h
3457 + add v14.8h, v5.8h, v3.8h
3458 + sub v9.8h, v5.8h, v3.8h
3459 + sub v13.8h, v10.8h, v2.8h
3460 + add v10.8h, v10.8h, v2.8h
3461 + sub v11.8h, v12.8h, v1.8h
3462 + add v12.8h, v12.8h, v1.8h
3463 + /* Descale to 8-bit and range limit */
3464 + movi v0.16b, #0x80
3465 + sqshrn v8.8b, v8.8h, #5
3466 + sqshrn2 v8.16b, v9.8h, #5
3467 + sqshrn v9.8b, v10.8h, #5
3468 + sqshrn2 v9.16b, v11.8h, #5
3469 + sqshrn v10.8b, v12.8h, #5
3470 + sqshrn2 v10.16b, v13.8h, #5
3471 + sqshrn v11.8b, v14.8h, #5
3472 + sqshrn2 v11.16b, v15.8h, #5
3473 + add v8.16b, v8.16b, v0.16b
3474 + add v9.16b, v9.16b, v0.16b
3475 + add v10.16b, v10.16b, v0.16b
3476 + add v11.16b, v11.16b, v0.16b
3477 + /* Transpose the final 8-bit samples */
3478 + /* Transpose q8-q9 */
3479 + mov v18.16b, v8.16b
3480 + trn1 v8.8h, v8.8h, v9.8h
3481 + trn2 v9.8h, v18.8h, v9.8h
3482 + /* Transpose q10-q11 */
3483 + mov v18.16b, v10.16b
3484 + trn1 v10.8h, v10.8h, v11.8h
3485 + trn2 v11.8h, v18.8h, v11.8h
3486 + /* Transpose q8-q10 */
3487 + mov v18.16b, v8.16b
3488 + trn1 v8.4s, v8.4s, v10.4s
3489 + trn2 v10.4s, v18.4s, v10.4s
3490 + /* Transpose q9-q11 */
3491 + mov v18.16b, v9.16b
3492 + trn1 v9.4s, v9.4s, v11.4s
3493 + trn2 v11.4s, v18.4s, v11.4s
3494 + /* make copy */
3495 + ins v17.2d[0], v8.2d[1]
3496 + /* Transpose d16-d17-msb */
3497 + mov v18.16b, v8.16b
3498 + trn1 v8.8b, v8.8b, v17.8b
3499 + trn2 v17.8b, v18.8b, v17.8b
3500 + /* make copy */
3501 + ins v19.2d[0], v9.2d[1]
3502 + mov v18.16b, v9.16b
3503 + trn1 v9.8b, v9.8b, v19.8b
3504 + trn2 v19.8b, v18.8b, v19.8b
3505 + /* Store results to the output buffer */
3506 + ldp TMP1, TMP2, [OUTPUT_BUF], 16
3507 + add TMP1, TMP1, OUTPUT_COL
3508 + add TMP2, TMP2, OUTPUT_COL
3509 + st1 {v8.8b}, [TMP1]
3510 + st1 {v17.8b}, [TMP2]
3511 + ldp TMP1, TMP2, [OUTPUT_BUF], 16
3512 + add TMP1, TMP1, OUTPUT_COL
3513 + add TMP2, TMP2, OUTPUT_COL
3514 + st1 {v9.8b}, [TMP1]
3515 + /* make copy */
3516 + ins v7.2d[0], v10.2d[1]
3517 + mov v18.16b, v10.16b
3518 + trn1 v10.8b, v10.8b, v7.8b
3519 + trn2 v7.8b, v18.8b, v7.8b
3520 + st1 {v19.8b}, [TMP2]
3521 + ldp TMP1, TMP2, [OUTPUT_BUF], 16
3522 + ldp TMP4, TMP5, [OUTPUT_BUF], 16
3523 + add TMP1, TMP1, OUTPUT_COL
3524 + add TMP2, TMP2, OUTPUT_COL
3525 + add TMP4, TMP4, OUTPUT_COL
3526 + add TMP5, TMP5, OUTPUT_COL
3527 + st1 {v10.8b}, [TMP1]
3528 + /* make copy */
3529 + ins v16.2d[0], v11.2d[1]
3530 + mov v18.16b, v11.16b
3531 + trn1 v11.8b, v11.8b, v16.8b
3532 + trn2 v16.8b, v18.8b, v16.8b
3533 + st1 {v7.8b}, [TMP2]
3534 + st1 {v11.8b}, [TMP4]
3535 + st1 {v16.8b}, [TMP5]
3536 + sub sp, sp, #176
3537 + ldp x22, x23, [sp], 16
3538 + ld1 {v0.8b - v3.8b}, [sp], 32
3539 + ld1 {v4.8b - v7.8b}, [sp], 32
3540 + ld1 {v8.8b - v11.8b}, [sp], 32
3541 + ld1 {v12.8b - v15.8b}, [sp], 32
3542 + ld1 {v16.8b - v19.8b}, [sp], 32
3543 + blr x30
3544 +
3545 + .unreq DCT_TABLE
3546 + .unreq COEF_BLOCK
3547 + .unreq OUTPUT_BUF
3548 + .unreq OUTPUT_COL
3549 + .unreq TMP1
3550 + .unreq TMP2
3551 + .unreq TMP3
3552 + .unreq TMP4
3553 +
3554 +
3555 +/*****************************************************************************/
3556 +
3557 +/*
3558 + * jsimd_idct_4x4_neon
3559 + *
3560 + * This function contains inverse-DCT code for getting reduced-size
3561 + * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
3562 + * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
3563 + * function from jpeg-6b (jidctred.c).
3564 + *
3565 + * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
3566 + * requires much less arithmetic operations and hence should be faster.
3567 + * The primary purpose of this particular NEON optimized function is
3568 + * bit exact compatibility with jpeg-6b.
3569 + *
3570 + * TODO: a bit better instructions scheduling can be achieved by expanding
3571 + * idct_helper/transpose_4x4 macros and reordering instructions,
3572 + * but readability will suffer somewhat.
3573 + */
3574 +
3575 +#define CONST_BITS 13
3576 +
3577 +#define FIX_0_211164243 (1730) /* FIX(0.211164243) */
3578 +#define FIX_0_509795579 (4176) /* FIX(0.509795579) */
3579 +#define FIX_0_601344887 (4926) /* FIX(0.601344887) */
3580 +#define FIX_0_720959822 (5906) /* FIX(0.720959822) */
3581 +#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
3582 +#define FIX_0_850430095 (6967) /* FIX(0.850430095) */
3583 +#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
3584 +#define FIX_1_061594337 (8697) /* FIX(1.061594337) */
3585 +#define FIX_1_272758580 (10426) /* FIX(1.272758580) */
3586 +#define FIX_1_451774981 (11893) /* FIX(1.451774981) */
3587 +#define FIX_1_847759065 (15137) /* FIX(1.847759065) */
3588 +#define FIX_2_172734803 (17799) /* FIX(2.172734803) */
3589 +#define FIX_2_562915447 (20995) /* FIX(2.562915447) */
3590 +#define FIX_3_624509785 (29692) /* FIX(3.624509785) */
3591 +
3592 +.balign 16
3593 +jsimd_idct_4x4_neon_consts:
3594 + .short FIX_1_847759065 /* v0.4h[0] */
3595 + .short -FIX_0_765366865 /* v0.4h[1] */
3596 + .short -FIX_0_211164243 /* v0.4h[2] */
3597 + .short FIX_1_451774981 /* v0.4h[3] */
3598 + .short -FIX_2_172734803 /* d1[0] */
3599 + .short FIX_1_061594337 /* d1[1] */
3600 + .short -FIX_0_509795579 /* d1[2] */
3601 + .short -FIX_0_601344887 /* d1[3] */
3602 + .short FIX_0_899976223 /* v2.4h[0] */
3603 + .short FIX_2_562915447 /* v2.4h[1] */
3604 + .short 1 << (CONST_BITS+1) /* v2.4h[2] */
3605 + .short 0 /* v2.4h[3] */
3606 +
3607 +.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
3608 + smull v28.4s, \x4, v2.4h[2]
3609 + smlal v28.4s, \x8, v0.4h[0]
3610 + smlal v28.4s, \x14, v0.4h[1]
3611 +
3612 + smull v26.4s, \x16, v1.4h[2]
3613 + smlal v26.4s, \x12, v1.4h[3]
3614 + smlal v26.4s, \x10, v2.4h[0]
3615 + smlal v26.4s, \x6, v2.4h[1]
3616 +
3617 + smull v30.4s, \x4, v2.4h[2]
3618 + smlsl v30.4s, \x8, v0.4h[0]
3619 + smlsl v30.4s, \x14, v0.4h[1]
3620 +
3621 + smull v24.4s, \x16, v0.4h[2]
3622 + smlal v24.4s, \x12, v0.4h[3]
3623 + smlal v24.4s, \x10, v1.4h[0]
3624 + smlal v24.4s, \x6, v1.4h[1]
3625 +
3626 + add v20.4s, v28.4s, v26.4s
3627 + sub v28.4s, v28.4s, v26.4s
3628 +
3629 +.if \shift > 16
3630 + srshr v20.4s, v20.4s, #\shift
3631 + srshr v28.4s, v28.4s, #\shift
3632 + xtn \y26, v20.4s
3633 + xtn \y29, v28.4s
3634 +.else
3635 + rshrn \y26, v20.4s, #\shift
3636 + rshrn \y29, v28.4s, #\shift
3637 +.endif
3638 +
3639 + add v20.4s, v30.4s, v24.4s
3640 + sub v30.4s, v30.4s, v24.4s
3641 +
3642 +.if \shift > 16
3643 + srshr v20.4s, v20.4s, #\shift
3644 + srshr v30.4s, v30.4s, #\shift
3645 + xtn \y27, v20.4s
3646 + xtn \y28, v30.4s
3647 +.else
3648 + rshrn \y27, v20.4s, #\shift
3649 + rshrn \y28, v30.4s, #\shift
3650 +.endif
3651 +
3652 +.endm
3653 +
3654 +asm_function jsimd_idct_4x4_neon
3655 +
3656 + DCT_TABLE .req x0
3657 + COEF_BLOCK .req x1
3658 + OUTPUT_BUF .req x2
3659 + OUTPUT_COL .req x3
3660 + TMP1 .req x0
3661 + TMP2 .req x1
3662 + TMP3 .req x2
3663 + TMP4 .req x15
3664 +
3665 + /* Save all used NEON registers */
3666 + sub sp, sp, 272
3667 + str x15, [sp], 16
3668 + /* Load constants (v3.4h is just used for padding) */
3669 + adr TMP4, jsimd_idct_4x4_neon_consts
3670 + st1 {v0.8b - v3.8b}, [sp], 32
3671 + st1 {v4.8b - v7.8b}, [sp], 32
3672 + st1 {v8.8b - v11.8b}, [sp], 32
3673 + st1 {v12.8b - v15.8b}, [sp], 32
3674 + st1 {v16.8b - v19.8b}, [sp], 32
3675 + st1 {v20.8b - v23.8b}, [sp], 32
3676 + st1 {v24.8b - v27.8b}, [sp], 32
3677 + st1 {v28.8b - v31.8b}, [sp], 32
3678 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
3679 +
3680 + /* Load all COEF_BLOCK into NEON registers with the following allocation:
3681 + * 0 1 2 3 | 4 5 6 7
3682 + * ---------+--------
3683 + * 0 | v4.4h | v5.4h
3684 + * 1 | v6.4h | v7.4h
3685 + * 2 | v8.4h | v9.4h
3686 + * 3 | v10.4h | v11.4h
3687 + * 4 | - | -
3688 + * 5 | v12.4h | v13.4h
3689 + * 6 | v14.4h | v15.4h
3690 + * 7 | v16.4h | v17.4h
3691 + */
3692 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
3693 + ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
3694 + add COEF_BLOCK, COEF_BLOCK, #16
3695 + ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
3696 + ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
3697 + /* dequantize */
3698 + ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
3699 + mul v4.4h, v4.4h, v18.4h
3700 + mul v5.4h, v5.4h, v19.4h
3701 + ins v4.2d[1], v5.2d[0] /* 128 bit q4 */
3702 + ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
3703 + mul v6.4h, v6.4h, v20.4h
3704 + mul v7.4h, v7.4h, v21.4h
3705 + ins v6.2d[1], v7.2d[0] /* 128 bit q6 */
3706 + mul v8.4h, v8.4h, v22.4h
3707 + mul v9.4h, v9.4h, v23.4h
3708 + ins v8.2d[1], v9.2d[0] /* 128 bit q8 */
3709 + add DCT_TABLE, DCT_TABLE, #16
3710 + ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
3711 + mul v10.4h, v10.4h, v24.4h
3712 + mul v11.4h, v11.4h, v25.4h
3713 + ins v10.2d[1], v11.2d[0] /* 128 bit q10 */
3714 + mul v12.4h, v12.4h, v26.4h
3715 + mul v13.4h, v13.4h, v27.4h
3716 + ins v12.2d[1], v13.2d[0] /* 128 bit q12 */
3717 + ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
3718 + mul v14.4h, v14.4h, v28.4h
3719 + mul v15.4h, v15.4h, v29.4h
3720 + ins v14.2d[1], v15.2d[0] /* 128 bit q14 */
3721 + mul v16.4h, v16.4h, v30.4h
3722 + mul v17.4h, v17.4h, v31.4h
3723 + ins v16.2d[1], v17.2d[0] /* 128 bit q16 */
3724 +
3725 + /* Pass 1 */
3726 + idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4 .4h, v6.4h, v8.4h, v10.4h
3727 + transpose_4x4 v4, v6, v8, v10, v3
3728 + ins v10.2d[1], v11.2d[0]
3729 + idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5 .4h, v7.4h, v9.4h, v11.4h
3730 + transpose_4x4 v5, v7, v9, v11, v3
3731 + ins v10.2d[1], v11.2d[0]
3732 + /* Pass 2 */
3733 + idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26. 4h, v27.4h, v28.4h, v29.4h
3734 + transpose_4x4 v26, v27, v28, v29, v3
3735 +
3736 + /* Range limit */
3737 + movi v30.8h, #0x80
3738 + ins v26.2d[1], v27.2d[0]
3739 + ins v28.2d[1], v29.2d[0]
3740 + add v26.8h, v26.8h, v30.8h
3741 + add v28.8h, v28.8h, v30.8h
3742 + sqxtun v26.8b, v26.8h
3743 + sqxtun v27.8b, v28.8h
3744 +
3745 + /* Store results to the output buffer */
3746 + ldp TMP1, TMP2, [OUTPUT_BUF], 16
3747 + ldp TMP3, TMP4, [OUTPUT_BUF]
3748 + add TMP1, TMP1, OUTPUT_COL
3749 + add TMP2, TMP2, OUTPUT_COL
3750 + add TMP3, TMP3, OUTPUT_COL
3751 + add TMP4, TMP4, OUTPUT_COL
3752 +
3753 +#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
3754 + /* We can use much less instructions on little endian systems if the
3755 + * OS kernel is not configured to trap unaligned memory accesses
3756 + */
3757 + st1 {v26.s}[0], [TMP1], 4
3758 + st1 {v27.s}[0], [TMP3], 4
3759 + st1 {v26.s}[1], [TMP2], 4
3760 + st1 {v27.s}[1], [TMP4], 4
3761 +#else
3762 + st1 {v26.b}[0], [TMP1], 1
3763 + st1 {v27.b}[0], [TMP3], 1
3764 + st1 {v26.b}[1], [TMP1], 1
3765 + st1 {v27.b}[1], [TMP3], 1
3766 + st1 {v26.b}[2], [TMP1], 1
3767 + st1 {v27.b}[2], [TMP3], 1
3768 + st1 {v26.b}[3], [TMP1], 1
3769 + st1 {v27.b}[3], [TMP3], 1
3770 +
3771 + st1 {v26.b}[4], [TMP2], 1
3772 + st1 {v27.b}[4], [TMP4], 1
3773 + st1 {v26.b}[5], [TMP2], 1
3774 + st1 {v27.b}[5], [TMP4], 1
3775 + st1 {v26.b}[6], [TMP2], 1
3776 + st1 {v27.b}[6], [TMP4], 1
3777 + st1 {v26.b}[7], [TMP2], 1
3778 + st1 {v27.b}[7], [TMP4], 1
3779 +#endif
3780 +
3781 + /* vpop {v8.4h - v15.4h} ;not available */
3782 + sub sp, sp, #272
3783 + ldr x15, [sp], 16
3784 + ld1 {v0.8b - v3.8b}, [sp], 32
3785 + ld1 {v4.8b - v7.8b}, [sp], 32
3786 + ld1 {v8.8b - v11.8b}, [sp], 32
3787 + ld1 {v12.8b - v15.8b}, [sp], 32
3788 + ld1 {v16.8b - v19.8b}, [sp], 32
3789 + ld1 {v20.8b - v23.8b}, [sp], 32
3790 + ld1 {v24.8b - v27.8b}, [sp], 32
3791 + ld1 {v28.8b - v31.8b}, [sp], 32
3792 + blr x30
3793 +
3794 + .unreq DCT_TABLE
3795 + .unreq COEF_BLOCK
3796 + .unreq OUTPUT_BUF
3797 + .unreq OUTPUT_COL
3798 + .unreq TMP1
3799 + .unreq TMP2
3800 + .unreq TMP3
3801 + .unreq TMP4
3802 +
3803 +.purgem idct_helper
3804 +
3805 +
3806 +/*****************************************************************************/
3807 +
3808 +/*
3809 + * jsimd_idct_2x2_neon
3810 + *
3811 + * This function contains inverse-DCT code for getting reduced-size
3812 + * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
3813 + * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
3814 + * function from jpeg-6b (jidctred.c).
3815 + *
3816 + * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
3817 + * requires much less arithmetic operations and hence should be faster.
3818 + * requires many fewer arithmetic operations and hence should be faster.
3819 + * bit exact compatibility with jpeg-6b.
3820 + */
3821 +
3822 +.balign 8
3823 +jsimd_idct_2x2_neon_consts:
3824 + .short -FIX_0_720959822 /* v14[0] */
3825 + .short FIX_0_850430095 /* v14[1] */
3826 + .short -FIX_1_272758580 /* v14[2] */
3827 + .short FIX_3_624509785 /* v14[3] */
3828 +
3829 +.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
3830 + sshll v15.4s, \x4, #15
3831 + smull v26.4s, \x6, v14.4h[3]
3832 + smlal v26.4s, \x10, v14.4h[2]
3833 + smlal v26.4s, \x12, v14.4h[1]
3834 + smlal v26.4s, \x16, v14.4h[0]
3835 +
3836 + add v20.4s, v15.4s, v26.4s
3837 + sub v15.4s, v15.4s, v26.4s
3838 +
3839 +.if \shift > 16
3840 + srshr v20.4s, v20.4s, #\shift
3841 + srshr v15.4s, v15.4s, #\shift
3842 + xtn \y26, v20.4s
3843 + xtn \y27, v15.4s
3844 +.else
3845 + rshrn \y26, v20.4s, #\shift
3846 + rshrn \y27, v15.4s, #\shift
3847 +.endif
3848 +
3849 +.endm
3850 +
3851 +asm_function jsimd_idct_2x2_neon
3852 +
3853 + DCT_TABLE .req x0
3854 + COEF_BLOCK .req x1
3855 + OUTPUT_BUF .req x2
3856 + OUTPUT_COL .req x3
3857 + TMP1 .req x0
3858 + TMP2 .req x15
3859 +
3860 + /* vpush {v8.4h - v15.4h} ; not available */
3861 + sub sp, sp, 208
3862 + str x15, [sp], 16
3863 +
3864 + /* Load constants */
3865 + adr TMP2, jsimd_idct_2x2_neon_consts
3866 + st1 {v4.8b - v7.8b}, [sp], 32
3867 + st1 {v8.8b - v11.8b}, [sp], 32
3868 + st1 {v12.8b - v15.8b}, [sp], 32
3869 + st1 {v16.8b - v19.8b}, [sp], 32
3870 + st1 {v21.8b - v22.8b}, [sp], 16
3871 + st1 {v24.8b - v27.8b}, [sp], 32
3872 + st1 {v30.8b - v31.8b}, [sp], 16
3873 + ld1 {v14.4h}, [TMP2]
3874 +
3875 + /* Load all COEF_BLOCK into NEON registers with the following allocation:
3876 + * 0 1 2 3 | 4 5 6 7
3877 + * ---------+--------
3878 + * 0 | v4.4h | v5.4h
3879 + * 1 | v6.4h | v7.4h
3880 + * 2 | - | -
3881 + * 3 | v10.4h | v11.4h
3882 + * 4 | - | -
3883 + * 5 | v12.4h | v13.4h
3884 + * 6 | - | -
3885 + * 7 | v16.4h | v17.4h
3886 + */
3887 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
3888 + add COEF_BLOCK, COEF_BLOCK, #16
3889 + ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16
3890 + add COEF_BLOCK, COEF_BLOCK, #16
3891 + ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16
3892 + add COEF_BLOCK, COEF_BLOCK, #16
3893 + ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
3894 + /* Dequantize */
3895 + ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
3896 + mul v4.4h, v4.4h, v18.4h
3897 + mul v5.4h, v5.4h, v19.4h
3898 + ins v4.2d[1], v5.2d[0]
3899 + mul v6.4h, v6.4h, v20.4h
3900 + mul v7.4h, v7.4h, v21.4h
3901 + ins v6.2d[1], v7.2d[0]
3902 + add DCT_TABLE, DCT_TABLE, #16
3903 + ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
3904 + mul v10.4h, v10.4h, v24.4h
3905 + mul v11.4h, v11.4h, v25.4h
3906 + ins v10.2d[1], v11.2d[0]
3907 + add DCT_TABLE, DCT_TABLE, #16
3908 + ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
3909 + mul v12.4h, v12.4h, v26.4h
3910 + mul v13.4h, v13.4h, v27.4h
3911 + ins v12.2d[1], v13.2d[0]
3912 + add DCT_TABLE, DCT_TABLE, #16
3913 + ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
3914 + mul v16.4h, v16.4h, v30.4h
3915 + mul v17.4h, v17.4h, v31.4h
3916 + ins v16.2d[1], v17.2d[0]
3917 +
3918 + /* Pass 1 */
3919 +#if 0
3920 + idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
3921 + transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
3922 + idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
3923 + transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
3924 +#else
3925 + smull v26.4s, v6.4h, v14.4h[3]
3926 + smlal v26.4s, v10.4h, v14.4h[2]
3927 + smlal v26.4s, v12.4h, v14.4h[1]
3928 + smlal v26.4s, v16.4h, v14.4h[0]
3929 + smull v24.4s, v7.4h, v14.4h[3]
3930 + smlal v24.4s, v11.4h, v14.4h[2]
3931 + smlal v24.4s, v13.4h, v14.4h[1]
3932 + smlal v24.4s, v17.4h, v14.4h[0]
3933 + sshll v15.4s, v4.4h, #15
3934 + sshll v30.4s, v5.4h, #15
3935 + add v20.4s, v15.4s, v26.4s
3936 + sub v15.4s, v15.4s, v26.4s
3937 + rshrn v4.4h, v20.4s, #13
3938 + rshrn v6.4h, v15.4s, #13
3939 + add v20.4s, v30.4s, v24.4s
3940 + sub v15.4s, v30.4s, v24.4s
3941 + rshrn v5.4h, v20.4s, #13
3942 + rshrn v7.4h, v15.4s, #13
3943 + ins v4.2d[1], v5.2d[0]
3944 + ins v6.2d[1], v7.2d[0]
3945 + transpose v4, v6, v3, .16b, .8h
3946 + transpose v6, v10, v3, .16b, .4s
3947 + ins v11.2d[0], v10.2d[1]
3948 + ins v7.2d[0], v6.2d[1]
3949 +#endif
3950 +
3951 + /* Pass 2 */
3952 + idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
3953 +
3954 + /* Range limit */
3955 + movi v30.8h, #0x80
3956 + ins v26.2d[1], v27.2d[0]
3957 + add v26.8h, v26.8h, v30.8h
3958 + sqxtun v30.8b, v26.8h
3959 + ins v26.2d[0], v30.2d[0]
3960 + sqxtun v27.8b, v26.8h
3961 +
3962 + /* Store results to the output buffer */
3963 + ldp TMP1, TMP2, [OUTPUT_BUF]
3964 + add TMP1, TMP1, OUTPUT_COL
3965 + add TMP2, TMP2, OUTPUT_COL
3966 +
3967 + st1 {v26.b}[0], [TMP1], 1
3968 + st1 {v27.b}[4], [TMP1], 1
3969 + st1 {v26.b}[1], [TMP2], 1
3970 + st1 {v27.b}[5], [TMP2], 1
3971 +
3972 + sub sp, sp, #208
3973 + ldr x15, [sp], 16
3974 + ld1 {v4.8b - v7.8b}, [sp], 32
3975 + ld1 {v8.8b - v11.8b}, [sp], 32
3976 + ld1 {v12.8b - v15.8b}, [sp], 32
3977 + ld1 {v16.8b - v19.8b}, [sp], 32
3978 + ld1 {v21.8b - v22.8b}, [sp], 16
3979 + ld1 {v24.8b - v27.8b}, [sp], 32
3980 + ld1 {v30.8b - v31.8b}, [sp], 16
3981 + blr x30
3982 +
3983 + .unreq DCT_TABLE
3984 + .unreq COEF_BLOCK
3985 + .unreq OUTPUT_BUF
3986 + .unreq OUTPUT_COL
3987 + .unreq TMP1
3988 + .unreq TMP2
3989 +
3990 +.purgem idct_helper
3991 +
3992 +
3993 +/*****************************************************************************/
3994 +
3995 +/*
3996 + * jsimd_ycc_extrgb_convert_neon
3997 + * jsimd_ycc_extbgr_convert_neon
3998 + * jsimd_ycc_extrgbx_convert_neon
3999 + * jsimd_ycc_extbgrx_convert_neon
4000 + * jsimd_ycc_extxbgr_convert_neon
4001 + * jsimd_ycc_extxrgb_convert_neon
4002 + *
4003 + * Colorspace conversion YCbCr -> RGB
4004 + */
4005 +
4006 +
4007 +.macro do_load size
4008 + .if \size == 8
4009 + ld1 {v4.8b}, [U], 8
4010 + ld1 {v5.8b}, [V], 8
4011 + ld1 {v0.8b}, [Y], 8
4012 + prfm PLDL1KEEP, [U, #64]
4013 + prfm PLDL1KEEP, [V, #64]
4014 + prfm PLDL1KEEP, [Y, #64]
4015 + .elseif \size == 4
4016 + ld1 {v4.b}[0], [U], 1
4017 + ld1 {v4.b}[1], [U], 1
4018 + ld1 {v4.b}[2], [U], 1
4019 + ld1 {v4.b}[3], [U], 1
4020 + ld1 {v5.b}[0], [V], 1
4021 + ld1 {v5.b}[1], [V], 1
4022 + ld1 {v5.b}[2], [V], 1
4023 + ld1 {v5.b}[3], [V], 1
4024 + ld1 {v0.b}[0], [Y], 1
4025 + ld1 {v0.b}[1], [Y], 1
4026 + ld1 {v0.b}[2], [Y], 1
4027 + ld1 {v0.b}[3], [Y], 1
4028 + .elseif \size == 2
4029 + ld1 {v4.b}[4], [U], 1
4030 + ld1 {v4.b}[5], [U], 1
4031 + ld1 {v5.b}[4], [V], 1
4032 + ld1 {v5.b}[5], [V], 1
4033 + ld1 {v0.b}[4], [Y], 1
4034 + ld1 {v0.b}[5], [Y], 1
4035 + .elseif \size == 1
4036 + ld1 {v4.b}[6], [U], 1
4037 + ld1 {v5.b}[6], [V], 1
4038 + ld1 {v0.b}[6], [Y], 1
4039 + .else
4040 + .error unsupported macroblock size
4041 + .endif
4042 +.endm
4043 +
4044 +.macro do_store bpp, size
4045 + .if \bpp == 24
4046 + .if \size == 8
4047 + st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24
4048 + .elseif \size == 4
4049 + st3 {v10.b, v11.b, v12.b}[0], [RGB], 3
4050 + st3 {v10.b, v11.b, v12.b}[1], [RGB], 3
4051 + st3 {v10.b, v11.b, v12.b}[2], [RGB], 3
4052 + st3 {v10.b, v11.b, v12.b}[3], [RGB], 3
4053 + .elseif \size == 2
4054 + st3 {v10.b, v11.b, v12.b}[4], [RGB], 3
4055 + st3 {v10.b, v11.b, v12.b}[5], [RGB], 3
4056 + .elseif \size == 1
4057 + st3 {v10.b, v11.b, v12.b}[6], [RGB], 3
4058 + .else
4059 + .error unsupported macroblock size
4060 + .endif
4061 + .elseif \bpp == 32
4062 + .if \size == 8
4063 + st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
4064 + .elseif \size == 4
4065 + st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
4066 + st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
4067 + st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
4068 + st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
4069 + .elseif \size == 2
4070 + st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
4071 + st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
4072 + .elseif \size == 1
4073 + st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
4074 + .else
4075 + .error unsupported macroblock size
4076 + .endif
4077 + .elseif \bpp==16
4078 + .if \size == 8
4079 + st1 {v25.8h}, [RGB],16
4080 + .elseif \size == 4
4081 + st1 {v25.4h}, [RGB],8
4082 + .elseif \size == 2
4083 + st1 {v25.h}[4], [RGB],2
4084 + st1 {v25.h}[5], [RGB],2
4085 + .elseif \size == 1
4086 + st1 {v25.h}[6], [RGB],2
4087 + .else
4088 + .error unsupported macroblock size
4089 + .endif
4090 + .else
4091 + .error unsupported bpp
4092 + .endif
4093 +.endm
4094 +
4095 +.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize
4096 +
4097 +/*
4098 + * 2-stage pipelined YCbCr->RGB conversion
4099 + */
4100 +
4101 +.macro do_yuv_to_rgb_stage1
4102 + uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */
4103 + uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
4104 + smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
4105 + smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
4106 + smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
4107 + smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
4108 + smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
4109 + smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
4110 + smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
4111 + smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
4112 +.endm
4113 +
4114 +.macro do_yuv_to_rgb_stage2
4115 + rshrn v20.4h, v20.4s, #15
4116 + rshrn2 v20.8h, v22.4s, #15
4117 + rshrn v24.4h, v24.4s, #14
4118 + rshrn2 v24.8h, v26.4s, #14
4119 + rshrn v28.4h, v28.4s, #14
4120 + rshrn2 v28.8h, v30.4s, #14
4121 + uaddw v20.8h, v20.8h, v0.8b
4122 + uaddw v24.8h, v24.8h, v0.8b
4123 + uaddw v28.8h, v28.8h, v0.8b
4124 +.if \bpp != 16
4125 + sqxtun v1\g_offs\defsize, v20.8h
4126 + sqxtun v1\r_offs\defsize, v24.8h
4127 + sqxtun v1\b_offs\defsize, v28.8h
4128 +.else
4129 + sqshlu v21.8h, v20.8h, #8
4130 + sqshlu v25.8h, v24.8h, #8
4131 + sqshlu v29.8h, v28.8h, #8
4132 + sri v25.8h, v21.8h, #5
4133 + sri v25.8h, v29.8h, #11
4134 +.endif
4135 +
4136 +.endm
4137 +
4138 +.macro do_yuv_to_rgb_stage2_store_load_stage1
4139 + rshrn v20.4h, v20.4s, #15
4140 + rshrn v24.4h, v24.4s, #14
4141 + rshrn v28.4h, v28.4s, #14
4142 + ld1 {v4.8b}, [U], 8
4143 + rshrn2 v20.8h, v22.4s, #15
4144 + rshrn2 v24.8h, v26.4s, #14
4145 + rshrn2 v28.8h, v30.4s, #14
4146 + ld1 {v5.8b}, [V], 8
4147 + uaddw v20.8h, v20.8h, v0.8b
4148 + uaddw v24.8h, v24.8h, v0.8b
4149 + uaddw v28.8h, v28.8h, v0.8b
4150 +.if \bpp != 16 /**************** rgb24/rgb32 *********************************/
4151 + sqxtun v1\g_offs\defsize, v20.8h
4152 + ld1 {v0.8b}, [Y], 8
4153 + sqxtun v1\r_offs\defsize, v24.8h
4154 + prfm PLDL1KEEP, [U, #64]
4155 + prfm PLDL1KEEP, [V, #64]
4156 + prfm PLDL1KEEP, [Y, #64]
4157 + sqxtun v1\b_offs\defsize, v28.8h
4158 + uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
4159 + uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
4160 + smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
4161 + smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
4162 + smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
4163 + smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
4164 + smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
4165 + smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
4166 +.else /**************************** rgb565 ***********************************/
4167 + sqshlu v21.8h, v20.8h, #8
4168 + sqshlu v25.8h, v24.8h, #8
4169 + sqshlu v29.8h, v28.8h, #8
4170 + uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */
4171 + uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */
4172 + ld1 {v0.8b}, [Y], 8
4173 + smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
4174 + smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
4175 + smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
4176 + smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
4177 + sri v25.8h, v21.8h, #5
4178 + smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
4179 + smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
4180 + prfm PLDL1KEEP, [U, #64]
4181 + prfm PLDL1KEEP, [V, #64]
4182 + prfm PLDL1KEEP, [Y, #64]
4183 + sri v25.8h, v29.8h, #11
4184 +.endif
4185 + do_store \bpp, 8
4186 + smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
4187 + smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
4188 +.endm
4189 +
4190 +.macro do_yuv_to_rgb
4191 + do_yuv_to_rgb_stage1
4192 + do_yuv_to_rgb_stage2
4193 +.endm
4194 +
4195 +/* Apple gas crashes on adrl, work around that by using adr.
4196 + * But this requires a copy of these constants for each function.
4197 + */
4198 +
4199 +.balign 16
4200 +jsimd_ycc_\colorid\()_neon_consts:
4201 + .short 0, 0, 0, 0
4202 + .short 22971, -11277, -23401, 29033
4203 + .short -128, -128, -128, -128
4204 + .short -128, -128, -128, -128
4205 +
4206 +asm_function jsimd_ycc_\colorid\()_convert_neon
4207 + OUTPUT_WIDTH .req x0
4208 + INPUT_BUF .req x1
4209 + INPUT_ROW .req x2
4210 + OUTPUT_BUF .req x3
4211 + NUM_ROWS .req x4
4212 +
4213 + INPUT_BUF0 .req x5
4214 + INPUT_BUF1 .req x6
4215 + INPUT_BUF2 .req INPUT_BUF
4216 +
4217 + RGB .req x7
4218 + Y .req x8
4219 + U .req x9
4220 + V .req x10
4221 + N .req x15
4222 +
4223 + sub sp, sp, 336
4224 + str x15, [sp], 16
4225 +    /* Load constants into v0.4h-v1.4h and v2.8h (v0.4h is just used for padding) */
4226 + adr x15, jsimd_ycc_\colorid\()_neon_consts
4227 + /* Save NEON registers */
4228 + st1 {v0.8b - v3.8b}, [sp], 32
4229 + st1 {v4.8b - v7.8b}, [sp], 32
4230 + st1 {v8.8b - v11.8b}, [sp], 32
4231 + st1 {v12.8b - v15.8b}, [sp], 32
4232 + st1 {v16.8b - v19.8b}, [sp], 32
4233 + st1 {v20.8b - v23.8b}, [sp], 32
4234 + st1 {v24.8b - v27.8b}, [sp], 32
4235 + st1 {v28.8b - v31.8b}, [sp], 32
4236 + ld1 {v0.4h, v1.4h}, [x15], 16
4237 + ld1 {v2.8h}, [x15]
4238 +
4239 + /* Save ARM registers and handle input arguments */
4240 + /* push {x4, x5, x6, x7, x8, x9, x10, x30} */
4241 + stp x4, x5, [sp], 16
4242 + stp x6, x7, [sp], 16
4243 + stp x8, x9, [sp], 16
4244 + stp x10, x30, [sp], 16
4245 + ldr INPUT_BUF0, [INPUT_BUF]
4246 + ldr INPUT_BUF1, [INPUT_BUF, 8]
4247 + ldr INPUT_BUF2, [INPUT_BUF, 16]
4248 + .unreq INPUT_BUF
4249 +
4250 +    /* Initially set v10.16b and v13.16b to 0xFF */
4251 + movi v10.16b, #255
4252 + movi v13.16b, #255
4253 +
4254 + /* Outer loop over scanlines */
4255 + cmp NUM_ROWS, #1
4256 + blt 9f
4257 +0:
4258 + lsl x16, INPUT_ROW, #3
4259 + ldr Y, [INPUT_BUF0, x16]
4260 + ldr U, [INPUT_BUF1, x16]
4261 + mov N, OUTPUT_WIDTH
4262 + ldr V, [INPUT_BUF2, x16]
4263 + add INPUT_ROW, INPUT_ROW, #1
4264 + ldr RGB, [OUTPUT_BUF], #8
4265 +
4266 + /* Inner loop over pixels */
4267 + subs N, N, #8
4268 + blt 3f
4269 + do_load 8
4270 + do_yuv_to_rgb_stage1
4271 + subs N, N, #8
4272 + blt 2f
4273 +1:
4274 + do_yuv_to_rgb_stage2_store_load_stage1
4275 + subs N, N, #8
4276 + bge 1b
4277 +2:
4278 + do_yuv_to_rgb_stage2
4279 + do_store \bpp, 8
4280 + tst N, #7
4281 + beq 8f
4282 +3:
4283 + tst N, #4
4284 + beq 3f
4285 + do_load 4
4286 +3:
4287 + tst N, #2
4288 + beq 4f
4289 + do_load 2
4290 +4:
4291 + tst N, #1
4292 + beq 5f
4293 + do_load 1
4294 +5:
4295 + do_yuv_to_rgb
4296 + tst N, #4
4297 + beq 6f
4298 + do_store \bpp, 4
4299 +6:
4300 + tst N, #2
4301 + beq 7f
4302 + do_store \bpp, 2
4303 +7:
4304 + tst N, #1
4305 + beq 8f
4306 + do_store \bpp, 1
4307 +8:
4308 + subs NUM_ROWS, NUM_ROWS, #1
4309 + bgt 0b
4310 +9:
4311 + /* Restore all registers and return */
4312 + sub sp, sp, #336
4313 + ldr x15, [sp], 16
4314 + ld1 {v0.8b - v3.8b}, [sp], 32
4315 + ld1 {v4.8b - v7.8b}, [sp], 32
4316 + ld1 {v8.8b - v11.8b}, [sp], 32
4317 + ld1 {v12.8b - v15.8b}, [sp], 32
4318 + ld1 {v16.8b - v19.8b}, [sp], 32
4319 + ld1 {v20.8b - v23.8b}, [sp], 32
4320 + ld1 {v24.8b - v27.8b}, [sp], 32
4321 + ld1 {v28.8b - v31.8b}, [sp], 32
4322 + /* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
4323 + ldp x4, x5, [sp], 16
4324 + ldp x6, x7, [sp], 16
4325 + ldp x8, x9, [sp], 16
4326 + ldp x10, x30, [sp], 16
4327 + br x30
4328 + .unreq OUTPUT_WIDTH
4329 + .unreq INPUT_ROW
4330 + .unreq OUTPUT_BUF
4331 + .unreq NUM_ROWS
4332 + .unreq INPUT_BUF0
4333 + .unreq INPUT_BUF1
4334 + .unreq INPUT_BUF2
4335 + .unreq RGB
4336 + .unreq Y
4337 + .unreq U
4338 + .unreq V
4339 + .unreq N
4340 +
4341 +.purgem do_yuv_to_rgb
4342 +.purgem do_yuv_to_rgb_stage1
4343 +.purgem do_yuv_to_rgb_stage2
4344 +.purgem do_yuv_to_rgb_stage2_store_load_stage1
4345 +.endm
4346 +
4347 +/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize */
4348 +generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b
4349 +generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b
4350 +generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b
4351 +generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b
4352 +generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b
4353 +generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b
4354 +generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b
4355 +.purgem do_load
4356 +.purgem do_store
OLDNEW
« no previous file with comments | « README.chromium ('k') | libjpeg.gyp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698