OLD | NEW |
1 Index: jdmarker.c | 1 Index: jdmarker.c |
2 =================================================================== | 2 =================================================================== |
3 --- jdmarker.c (revision 829) | 3 --- jdmarker.c (revision 829) |
4 +++ jdmarker.c (working copy) | 4 +++ jdmarker.c (working copy) |
5 @@ -910,7 +910,7 @@ | 5 @@ -910,7 +910,7 @@ |
6 } | 6 } |
7 | 7 |
8 if (cinfo->marker->discarded_bytes != 0) { | 8 if (cinfo->marker->discarded_bytes != 0) { |
9 - WARNMS2(cinfo, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c); | 9 - WARNMS2(cinfo, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c); |
10 + TRACEMS2(cinfo, 1, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c); | 10 + TRACEMS2(cinfo, 1, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c); |
(...skipping 1920 matching lines...) |
1931 code = dctbl->ehufco[nbits]; | 1931 code = dctbl->ehufco[nbits]; |
1932 @@ -516,7 +546,7 @@ | 1932 @@ -516,7 +546,7 @@ |
1933 temp ^= temp3; \ | 1933 temp ^= temp3; \ |
1934 temp -= temp3; \ | 1934 temp -= temp3; \ |
1935 temp2 += temp3; \ | 1935 temp2 += temp3; \ |
1936 - nbits = jpeg_nbits_table[temp]; \ | 1936 - nbits = jpeg_nbits_table[temp]; \ |
1937 + nbits = JPEG_NBITS_NONZERO(temp); \ | 1937 + nbits = JPEG_NBITS_NONZERO(temp); \ |
1938 /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \ | 1938 /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \ |
1939 while (r > 15) { \ | 1939 while (r > 15) { \ |
1940 EMIT_BITS(code_0xf0, size_0xf0) \ | 1940 EMIT_BITS(code_0xf0, size_0xf0) \ |
| 1941 Index: simd/jsimd_arm64.c |
| 1942 =================================================================== |
| 1943 --- /dev/null |
| 1944 +++ simd/jsimd_arm64.c |
| 1945 @@ -0,0 +1,544 @@ |
| 1946 +/* |
| 1947 + * jsimd_arm64.c |
| 1948 + * |
| 1949 + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
| 1950 + * Copyright 2009-2011, 2013-2014 D. R. Commander |
| 1951 + * |
| 1952 + * Based on the x86 SIMD extension for IJG JPEG library, |
| 1953 + * Copyright (C) 1999-2006, MIYASAKA Masaru. |
| 1954 + * For conditions of distribution and use, see copyright notice in jsimdext.inc |
| 1955 + * |
| 1956 + * This file contains the interface between the "normal" portions |
| 1957 + * of the library and the SIMD implementations when running on a |
| 1958 + * 64-bit ARM architecture. |
| 1959 + */ |
| 1960 + |
| 1961 +#define JPEG_INTERNALS |
| 1962 +#include "../jinclude.h" |
| 1963 +#include "../jpeglib.h" |
| 1964 +#include "../jsimd.h" |
| 1965 +#include "../jdct.h" |
| 1966 +#include "../jsimddct.h" |
| 1967 +#include "jsimd.h" |
| 1968 + |
| 1969 +#include <stdio.h> |
| 1970 +#include <string.h> |
| 1971 +#include <ctype.h> |
| 1972 + |
| 1973 +static unsigned int simd_support = ~0; |
| 1974 + |
| 1975 +/* |
| 1976 + * Check what SIMD accelerations are supported. |
| 1977 + * |
| 1978 + * FIXME: This code is racy under a multi-threaded environment. |
| 1979 + */ |
| 1980 + |
| 1981 +/* |
| 1982 + * ARMv8 architectures support NEON extensions by default. |
| 1983 + * It is no longer optional as it was with ARMv7. |
| 1984 + */ |
| 1985 + |
| 1986 + |
| 1987 +LOCAL(void) |
| 1988 +init_simd (void) |
| 1989 +{ |
| 1990 + char *env = NULL; |
| 1991 + |
| 1992 + if (simd_support != ~0U) |
| 1993 + return; |
| 1994 + |
| 1995 + simd_support = 0; |
| 1996 + |
| 1997 + simd_support |= JSIMD_ARM_NEON; |
| 1998 + |
| 1999 + /* Force different settings through environment variables */ |
| 2000 + env = getenv("JSIMD_FORCENEON"); |
| 2001 + if ((env != NULL) && (strcmp(env, "1") == 0)) |
| 2002 + simd_support &= JSIMD_ARM_NEON; |
| 2003 + env = getenv("JSIMD_FORCENONE"); |
| 2004 + if ((env != NULL) && (strcmp(env, "1") == 0)) |
| 2005 + simd_support = 0; |
| 2006 +} |
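
For reference, the two environment variables read above override detection at run time: JSIMD_FORCENEON masks the support flags down to NEON only, and JSIMD_FORCENONE disables SIMD entirely. A minimal sketch of using the latter from an application, assuming a POSIX setenv(); note it must run before the first jpeg_create_compress/decompress call, since simd_support is cached after the first init_simd():

    /* Illustrative only: force the plain-C code paths via the override
     * variable honored by init_simd() above.  Assumes POSIX setenv();
     * must run before libjpeg-turbo is first initialized, because
     * simd_support is cached after the first init_simd() call. */
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
      if (setenv("JSIMD_FORCENONE", "1", 1) != 0) {
        perror("setenv");
        return 1;
      }
      printf("JSIMD_FORCENONE=%s\n", getenv("JSIMD_FORCENONE"));
      return 0;
    }
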
| 2007 + |
| 2008 +GLOBAL(int) |
| 2009 +jsimd_can_rgb_ycc (void) |
| 2010 +{ |
| 2011 + init_simd(); |
| 2012 + |
| 2013 + return 0; |
| 2014 +} |
| 2015 + |
| 2016 +GLOBAL(int) |
| 2017 +jsimd_can_rgb_gray (void) |
| 2018 +{ |
| 2019 + init_simd(); |
| 2020 + |
| 2021 + return 0; |
| 2022 +} |
| 2023 + |
| 2024 +GLOBAL(int) |
| 2025 +jsimd_can_ycc_rgb (void) |
| 2026 +{ |
| 2027 + init_simd(); |
| 2028 + |
| 2029 + /* The code is optimised for these values only */ |
| 2030 + if (BITS_IN_JSAMPLE != 8) |
| 2031 + return 0; |
| 2032 + if (sizeof(JDIMENSION) != 4) |
| 2033 + return 0; |
| 2034 + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) |
| 2035 + return 0; |
| 2036 + |
| 2037 + if (simd_support & JSIMD_ARM_NEON) |
| 2038 + return 1; |
| 2039 + |
| 2040 + return 0; |
| 2041 +} |
| 2042 + |
| 2043 +GLOBAL(int) |
| 2044 +jsimd_can_ycc_rgb565 (void) |
| 2045 +{ |
| 2046 + init_simd(); |
| 2047 + |
| 2048 + /* The code is optimised for these values only */ |
| 2049 + if (BITS_IN_JSAMPLE != 8) |
| 2050 + return 0; |
| 2051 + if (sizeof(JDIMENSION) != 4) |
| 2052 + return 0; |
| 2053 + |
| 2054 + if (simd_support & JSIMD_ARM_NEON) |
| 2055 + return 1; |
| 2056 + |
| 2057 + return 0; |
| 2058 +} |
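
The jsimd_can_*() / jsimd_*() pairs above form the dispatch contract for the rest of the library: a "can" function returns nonzero only when the build parameters match what the NEON code was written for and NEON support was detected, and only then is the corresponding "do" function used. A hedged sketch of how a caller selects between the two paths (simplified and hypothetical, not the actual jdcolor.c logic; ycc_rgb_convert_c stands in for the plain-C fallback):

    /* Hypothetical, simplified selector illustrating the "can"/"do" contract;
     * assumes the same headers as this file (jpeglib.h, jsimd.h). */
    typedef void (*ycc_rgb_fn) (j_decompress_ptr cinfo,
                                JSAMPIMAGE input_buf, JDIMENSION input_row,
                                JSAMPARRAY output_buf, int num_rows);

    extern void ycc_rgb_convert_c (j_decompress_ptr cinfo,
                                   JSAMPIMAGE input_buf, JDIMENSION input_row,
                                   JSAMPARRAY output_buf, int num_rows);

    static ycc_rgb_fn
    select_ycc_rgb_convert (void)
    {
      if (jsimd_can_ycc_rgb())
        return jsimd_ycc_rgb_convert;   /* NEON-backed wrapper defined below */
      return ycc_rgb_convert_c;         /* hypothetical plain-C fallback */
    }
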
| 2059 + |
| 2060 +GLOBAL(void) |
| 2061 +jsimd_rgb_ycc_convert (j_compress_ptr cinfo, |
| 2062 + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, |
| 2063 + JDIMENSION output_row, int num_rows) |
| 2064 +{ |
| 2065 +} |
| 2066 + |
| 2067 +GLOBAL(void) |
| 2068 +jsimd_rgb_gray_convert (j_compress_ptr cinfo, |
| 2069 + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, |
| 2070 + JDIMENSION output_row, int num_rows) |
| 2071 +{ |
| 2072 +} |
| 2073 + |
| 2074 +GLOBAL(void) |
| 2075 +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, |
| 2076 + JSAMPIMAGE input_buf, JDIMENSION input_row, |
| 2077 + JSAMPARRAY output_buf, int num_rows) |
| 2078 +{ |
| 2079 + void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); |
| 2080 + |
| 2081 + switch(cinfo->out_color_space) { |
| 2082 + case JCS_EXT_RGB: |
| 2083 + neonfct=jsimd_ycc_extrgb_convert_neon; |
| 2084 + break; |
| 2085 + case JCS_EXT_RGBX: |
| 2086 + case JCS_EXT_RGBA: |
| 2087 + neonfct=jsimd_ycc_extrgbx_convert_neon; |
| 2088 + break; |
| 2089 + case JCS_EXT_BGR: |
| 2090 + neonfct=jsimd_ycc_extbgr_convert_neon; |
| 2091 + break; |
| 2092 + case JCS_EXT_BGRX: |
| 2093 + case JCS_EXT_BGRA: |
| 2094 + neonfct=jsimd_ycc_extbgrx_convert_neon; |
| 2095 + break; |
| 2096 + case JCS_EXT_XBGR: |
| 2097 + case JCS_EXT_ABGR: |
| 2098 + neonfct=jsimd_ycc_extxbgr_convert_neon; |
| 2099 + break; |
| 2100 + case JCS_EXT_XRGB: |
| 2101 + case JCS_EXT_ARGB: |
| 2102 + neonfct=jsimd_ycc_extxrgb_convert_neon; |
| 2103 + break; |
| 2104 + default: |
| 2105 + neonfct=jsimd_ycc_extrgb_convert_neon; |
| 2106 + break; |
| 2107 + } |
| 2108 + |
| 2109 + if (simd_support & JSIMD_ARM_NEON) |
| 2110 + neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); |
| 2111 +} |
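
The jsimd_ycc_ext*_convert_neon routines selected above are expected to implement the usual JFIF YCbCr-to-RGB transform, vectorized and with the output channel order/padding chosen by the extended colorspace. A scalar per-pixel sketch of that math (illustrative, floating point for clarity):

    /* Illustrative scalar form of the per-pixel YCbCr->RGB math that the
     * NEON converters vectorize (JFIF constants, floating point for clarity). */
    static void ycc_to_rgb_pixel (int y, int cb, int cr, int *r, int *g, int *b)
    {
      double R = y + 1.40200 * (cr - 128);
      double G = y - 0.34414 * (cb - 128) - 0.71414 * (cr - 128);
      double B = y + 1.77200 * (cb - 128);

      *r = R < 0.0 ? 0 : R > 255.0 ? 255 : (int)(R + 0.5);
      *g = G < 0.0 ? 0 : G > 255.0 ? 255 : (int)(G + 0.5);
      *b = B < 0.0 ? 0 : B > 255.0 ? 255 : (int)(B + 0.5);
    }
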
| 2112 + |
| 2113 +GLOBAL(void) |
| 2114 +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, |
| 2115 + JSAMPIMAGE input_buf, JDIMENSION input_row, |
| 2116 + JSAMPARRAY output_buf, int num_rows) |
| 2117 +{ |
| 2118 + if (simd_support & JSIMD_ARM_NEON) |
| 2119 + jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row, |
| 2120 + output_buf, num_rows); |
| 2121 +} |
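
jsimd_ycc_rgb565_convert_neon additionally packs each converted pixel into a 16-bit 5-6-5 value. The packing itself is plain bit selection; a scalar sketch (illustrative; endianness handling inside the NEON routine is not shown):

    /* Illustrative RGB565 packing: 5 bits red, 6 bits green, 5 bits blue. */
    #include <stdint.h>

    static uint16_t pack_rgb565 (uint8_t r, uint8_t g, uint8_t b)
    {
      return (uint16_t)(((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3));
    }
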
| 2122 + |
| 2123 +GLOBAL(int) |
| 2124 +jsimd_can_h2v2_downsample (void) |
| 2125 +{ |
| 2126 + init_simd(); |
| 2127 + |
| 2128 + return 0; |
| 2129 +} |
| 2130 + |
| 2131 +GLOBAL(int) |
| 2132 +jsimd_can_h2v1_downsample (void) |
| 2133 +{ |
| 2134 + init_simd(); |
| 2135 + |
| 2136 + return 0; |
| 2137 +} |
| 2138 + |
| 2139 +GLOBAL(void) |
| 2140 +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, |
| 2141 + JSAMPARRAY input_data, JSAMPARRAY output_data) |
| 2142 +{ |
| 2143 +} |
| 2144 + |
| 2145 +GLOBAL(void) |
| 2146 +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, |
| 2147 + JSAMPARRAY input_data, JSAMPARRAY output_data) |
| 2148 +{ |
| 2149 +} |
| 2150 + |
| 2151 +GLOBAL(int) |
| 2152 +jsimd_can_h2v2_upsample (void) |
| 2153 +{ |
| 2154 + init_simd(); |
| 2155 + |
| 2156 + return 0; |
| 2157 +} |
| 2158 + |
| 2159 +GLOBAL(int) |
| 2160 +jsimd_can_h2v1_upsample (void) |
| 2161 +{ |
| 2162 + init_simd(); |
| 2163 + |
| 2164 + return 0; |
| 2165 +} |
| 2166 + |
| 2167 +GLOBAL(void) |
| 2168 +jsimd_h2v2_upsample (j_decompress_ptr cinfo, |
| 2169 + jpeg_component_info * compptr, |
| 2170 + JSAMPARRAY input_data, |
| 2171 + JSAMPARRAY * output_data_ptr) |
| 2172 +{ |
| 2173 +} |
| 2174 + |
| 2175 +GLOBAL(void) |
| 2176 +jsimd_h2v1_upsample (j_decompress_ptr cinfo, |
| 2177 + jpeg_component_info * compptr, |
| 2178 + JSAMPARRAY input_data, |
| 2179 + JSAMPARRAY * output_data_ptr) |
| 2180 +{ |
| 2181 +} |
| 2182 + |
| 2183 +GLOBAL(int) |
| 2184 +jsimd_can_h2v2_fancy_upsample (void) |
| 2185 +{ |
| 2186 + init_simd(); |
| 2187 + |
| 2188 + return 0; |
| 2189 +} |
| 2190 + |
| 2191 +GLOBAL(int) |
| 2192 +jsimd_can_h2v1_fancy_upsample (void) |
| 2193 +{ |
| 2194 + init_simd(); |
| 2195 + |
| 2196 + return 0; |
| 2197 +} |
| 2198 + |
| 2199 +GLOBAL(void) |
| 2200 +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, |
| 2201 + jpeg_component_info * compptr, |
| 2202 + JSAMPARRAY input_data, |
| 2203 + JSAMPARRAY * output_data_ptr) |
| 2204 +{ |
| 2205 +} |
| 2206 + |
| 2207 +GLOBAL(void) |
| 2208 +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, |
| 2209 + jpeg_component_info * compptr, |
| 2210 + JSAMPARRAY input_data, |
| 2211 + JSAMPARRAY * output_data_ptr) |
| 2212 +{ |
| 2213 +} |
| 2214 + |
| 2215 +GLOBAL(int) |
| 2216 +jsimd_can_h2v2_merged_upsample (void) |
| 2217 +{ |
| 2218 + init_simd(); |
| 2219 + |
| 2220 + return 0; |
| 2221 +} |
| 2222 + |
| 2223 +GLOBAL(int) |
| 2224 +jsimd_can_h2v1_merged_upsample (void) |
| 2225 +{ |
| 2226 + init_simd(); |
| 2227 + |
| 2228 + return 0; |
| 2229 +} |
| 2230 + |
| 2231 +GLOBAL(void) |
| 2232 +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, |
| 2233 + JSAMPIMAGE input_buf, |
| 2234 + JDIMENSION in_row_group_ctr, |
| 2235 + JSAMPARRAY output_buf) |
| 2236 +{ |
| 2237 +} |
| 2238 + |
| 2239 +GLOBAL(void) |
| 2240 +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, |
| 2241 + JSAMPIMAGE input_buf, |
| 2242 + JDIMENSION in_row_group_ctr, |
| 2243 + JSAMPARRAY output_buf) |
| 2244 +{ |
| 2245 +} |
| 2246 + |
| 2247 +GLOBAL(int) |
| 2248 +jsimd_can_convsamp (void) |
| 2249 +{ |
| 2250 + init_simd(); |
| 2251 + |
| 2252 + return 0; |
| 2253 +} |
| 2254 + |
| 2255 +GLOBAL(int) |
| 2256 +jsimd_can_convsamp_float (void) |
| 2257 +{ |
| 2258 + init_simd(); |
| 2259 + |
| 2260 + return 0; |
| 2261 +} |
| 2262 + |
| 2263 +GLOBAL(void) |
| 2264 +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, |
| 2265 + DCTELEM * workspace) |
| 2266 +{ |
| 2267 +} |
| 2268 + |
| 2269 +GLOBAL(void) |
| 2270 +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, |
| 2271 + FAST_FLOAT * workspace) |
| 2272 +{ |
| 2273 +} |
| 2274 + |
| 2275 +GLOBAL(int) |
| 2276 +jsimd_can_fdct_islow (void) |
| 2277 +{ |
| 2278 + init_simd(); |
| 2279 + |
| 2280 + return 0; |
| 2281 +} |
| 2282 + |
| 2283 +GLOBAL(int) |
| 2284 +jsimd_can_fdct_ifast (void) |
| 2285 +{ |
| 2286 + init_simd(); |
| 2287 + |
| 2288 + return 0; |
| 2289 +} |
| 2290 + |
| 2291 +GLOBAL(int) |
| 2292 +jsimd_can_fdct_float (void) |
| 2293 +{ |
| 2294 + init_simd(); |
| 2295 + |
| 2296 + return 0; |
| 2297 +} |
| 2298 + |
| 2299 +GLOBAL(void) |
| 2300 +jsimd_fdct_islow (DCTELEM * data) |
| 2301 +{ |
| 2302 +} |
| 2303 + |
| 2304 +GLOBAL(void) |
| 2305 +jsimd_fdct_ifast (DCTELEM * data) |
| 2306 +{ |
| 2307 +} |
| 2308 + |
| 2309 +GLOBAL(void) |
| 2310 +jsimd_fdct_float (FAST_FLOAT * data) |
| 2311 +{ |
| 2312 +} |
| 2313 + |
| 2314 +GLOBAL(int) |
| 2315 +jsimd_can_quantize (void) |
| 2316 +{ |
| 2317 + init_simd(); |
| 2318 + |
| 2319 + return 0; |
| 2320 +} |
| 2321 + |
| 2322 +GLOBAL(int) |
| 2323 +jsimd_can_quantize_float (void) |
| 2324 +{ |
| 2325 + init_simd(); |
| 2326 + |
| 2327 + return 0; |
| 2328 +} |
| 2329 + |
| 2330 +GLOBAL(void) |
| 2331 +jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors, |
| 2332 + DCTELEM * workspace) |
| 2333 +{ |
| 2334 +} |
| 2335 + |
| 2336 +GLOBAL(void) |
| 2337 +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, |
| 2338 + FAST_FLOAT * workspace) |
| 2339 +{ |
| 2340 +} |
| 2341 + |
| 2342 +GLOBAL(int) |
| 2343 +jsimd_can_idct_2x2 (void) |
| 2344 +{ |
| 2345 + init_simd(); |
| 2346 + |
| 2347 + /* The code is optimised for these values only */ |
| 2348 + if (DCTSIZE != 8) |
| 2349 + return 0; |
| 2350 + if (sizeof(JCOEF) != 2) |
| 2351 + return 0; |
| 2352 + if (BITS_IN_JSAMPLE != 8) |
| 2353 + return 0; |
| 2354 + if (sizeof(JDIMENSION) != 4) |
| 2355 + return 0; |
| 2356 + if (sizeof(ISLOW_MULT_TYPE) != 2) |
| 2357 + return 0; |
| 2358 + |
| 2359 + if (simd_support & JSIMD_ARM_NEON) |
| 2360 + return 1; |
| 2361 + |
| 2362 + return 0; |
| 2363 +} |
| 2364 + |
| 2365 +GLOBAL(int) |
| 2366 +jsimd_can_idct_4x4 (void) |
| 2367 +{ |
| 2368 + init_simd(); |
| 2369 + |
| 2370 + /* The code is optimised for these values only */ |
| 2371 + if (DCTSIZE != 8) |
| 2372 + return 0; |
| 2373 + if (sizeof(JCOEF) != 2) |
| 2374 + return 0; |
| 2375 + if (BITS_IN_JSAMPLE != 8) |
| 2376 + return 0; |
| 2377 + if (sizeof(JDIMENSION) != 4) |
| 2378 + return 0; |
| 2379 + if (sizeof(ISLOW_MULT_TYPE) != 2) |
| 2380 + return 0; |
| 2381 + |
| 2382 + if (simd_support & JSIMD_ARM_NEON) |
| 2383 + return 1; |
| 2384 + |
| 2385 + return 0; |
| 2386 +} |
| 2387 + |
| 2388 +GLOBAL(void) |
| 2389 +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, |
| 2390 + JCOEFPTR coef_block, JSAMPARRAY output_buf, |
| 2391 + JDIMENSION output_col) |
| 2392 +{ |
| 2393 + if (simd_support & JSIMD_ARM_NEON) |
| 2394 + jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, |
| 2395 + output_col); |
| 2396 +} |
| 2397 + |
| 2398 +GLOBAL(void) |
| 2399 +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, |
| 2400 + JCOEFPTR coef_block, JSAMPARRAY output_buf, |
| 2401 + JDIMENSION output_col) |
| 2402 +{ |
| 2403 + if (simd_support & JSIMD_ARM_NEON) |
| 2404 + jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, |
| 2405 + output_col); |
| 2406 +} |
| 2407 + |
| 2408 +GLOBAL(int) |
| 2409 +jsimd_can_idct_islow (void) |
| 2410 +{ |
| 2411 + init_simd(); |
| 2412 + |
| 2413 + /* The code is optimised for these values only */ |
| 2414 + if (DCTSIZE != 8) |
| 2415 + return 0; |
| 2416 + if (sizeof(JCOEF) != 2) |
| 2417 + return 0; |
| 2418 + if (BITS_IN_JSAMPLE != 8) |
| 2419 + return 0; |
| 2420 + if (sizeof(JDIMENSION) != 4) |
| 2421 + return 0; |
| 2422 + if (sizeof(ISLOW_MULT_TYPE) != 2) |
| 2423 + return 0; |
| 2424 + |
| 2425 + if (simd_support & JSIMD_ARM_NEON) |
| 2426 + return 1; |
| 2427 + |
| 2428 + return 0; |
| 2429 +} |
| 2430 + |
| 2431 +GLOBAL(int) |
| 2432 +jsimd_can_idct_ifast (void) |
| 2433 +{ |
| 2434 + init_simd(); |
| 2435 + |
| 2436 + /* The code is optimised for these values only */ |
| 2437 + if (DCTSIZE != 8) |
| 2438 + return 0; |
| 2439 + if (sizeof(JCOEF) != 2) |
| 2440 + return 0; |
| 2441 + if (BITS_IN_JSAMPLE != 8) |
| 2442 + return 0; |
| 2443 + if (sizeof(JDIMENSION) != 4) |
| 2444 + return 0; |
| 2445 + if (sizeof(IFAST_MULT_TYPE) != 2) |
| 2446 + return 0; |
| 2447 + if (IFAST_SCALE_BITS != 2) |
| 2448 + return 0; |
| 2449 + |
| 2450 + if (simd_support & JSIMD_ARM_NEON) |
| 2451 + return 1; |
| 2452 + |
| 2453 + return 0; |
| 2454 +} |
| 2455 + |
| 2456 +GLOBAL(int) |
| 2457 +jsimd_can_idct_float (void) |
| 2458 +{ |
| 2459 + init_simd(); |
| 2460 + |
| 2461 + return 0; |
| 2462 +} |
| 2463 + |
| 2464 +GLOBAL(void) |
| 2465 +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, |
| 2466 + JCOEFPTR coef_block, JSAMPARRAY output_buf, |
| 2467 + JDIMENSION output_col) |
| 2468 +{ |
| 2469 + if (simd_support & JSIMD_ARM_NEON) |
| 2470 + jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf, |
| 2471 + output_col); |
| 2472 +} |
| 2473 + |
| 2474 +GLOBAL(void) |
| 2475 +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, |
| 2476 + JCOEFPTR coef_block, JSAMPARRAY output_buf, |
| 2477 + JDIMENSION output_col) |
| 2478 +{ |
| 2479 + if (simd_support & JSIMD_ARM_NEON) |
| 2480 + jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, |
| 2481 + output_col); |
| 2482 +} |
| 2483 + |
| 2484 +GLOBAL(void) |
| 2485 +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, |
| 2486 + JCOEFPTR coef_block, JSAMPARRAY output_buf, |
| 2487 + JDIMENSION output_col) |
| 2488 +{ |
| 2489 +} |
| 2490 Index: simd/jsimd_arm64_neon.S |
| 2491 new file mode 100644 |
| 2492 =================================================================== |
| 2493 --- /dev/null |
| 2494 +++ simd/jsimd_arm64_neon.S |
| 2495 @@ -0,0 +1,1861 @@ |
| 2496 +/* |
| 2497 + * ARMv8 NEON optimizations for libjpeg-turbo |
| 2498 + * |
| 2499 + * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies). |
| 2500 + * All rights reserved. |
| 2501 + * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> |
| 2502 + * Copyright (C) 2013-2014, Linaro Limited |
| 2503 + * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> |
| 2504 + * |
| 2505 + * This software is provided 'as-is', without any express or implied |
| 2506 + * warranty. In no event will the authors be held liable for any damages |
| 2507 + * arising from the use of this software. |
| 2508 + * |
| 2509 + * Permission is granted to anyone to use this software for any purpose, |
| 2510 + * including commercial applications, and to alter it and redistribute it |
| 2511 + * freely, subject to the following restrictions: |
| 2512 + * |
| 2513 + * 1. The origin of this software must not be misrepresented; you must not |
| 2514 + * claim that you wrote the original software. If you use this software |
| 2515 + * in a product, an acknowledgment in the product documentation would be |
| 2516 + * appreciated but is not required. |
| 2517 + * 2. Altered source versions must be plainly marked as such, and must not be |
| 2518 + * misrepresented as being the original software. |
| 2519 + * 3. This notice may not be removed or altered from any source distribution. |
| 2520 + */ |
| 2521 + |
| 2522 +#if defined(__linux__) && defined(__ELF__) |
| 2523 +.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */ |
| 2524 +#endif |
| 2525 + |
| 2526 +.text |
| 2527 +.arch armv8-a+fp+simd |
| 2528 + |
| 2529 + |
| 2530 +#define RESPECT_STRICT_ALIGNMENT 1 |
| 2531 + |
| 2532 + |
| 2533 +/*****************************************************************************/ |
| 2534 + |
| 2535 +/* Supplementary macro for setting function attributes */ |
| 2536 +.macro asm_function fname |
| 2537 +#ifdef __APPLE__ |
| 2538 + .globl _\fname |
| 2539 +_\fname: |
| 2540 +#else |
| 2541 + .global \fname |
| 2542 +#ifdef __ELF__ |
| 2543 + .hidden \fname |
| 2544 + .type \fname, %function |
| 2545 +#endif |
| 2546 +\fname: |
| 2547 +#endif |
| 2548 +.endm |
| 2549 + |
| 2550 +/* Transpose elements of single 128 bit registers */ |
| 2551 +.macro transpose_single x0,x1,xi,xilen,literal |
| 2552 + ins \xi\xilen[0], \x0\xilen[0] |
| 2553 + ins \x1\xilen[0], \x0\xilen[1] |
| 2554 + trn1 \x0\literal, \x0\literal, \x1\literal |
| 2555 + trn2 \x1\literal, \xi\literal, \x1\literal |
| 2556 +.endm |
| 2557 + |
| 2558 +/* Transpose elements of 2 different registers */ |
| 2559 +.macro transpose x0,x1,xi,xilen,literal |
| 2560 + mov \xi\xilen, \x0\xilen |
| 2561 + trn1 \x0\literal, \x0\literal, \x1\literal |
| 2562 + trn2 \x1\literal, \xi\literal, \x1\literal |
| 2563 +.endm |
| 2564 + |
| 2565 +/* Transpose a block of 4x4 coefficients in four 64-bit registers */ |
| 2566 +.macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen |
| 2567 + mov \xi\xilen, \x0\xilen |
| 2568 + trn1 \x0\x0len, \x0\x0len, \x2\x2len |
| 2569 + trn2 \x2\x2len, \xi\x0len, \x2\x2len |
| 2570 + mov \xi\xilen, \x1\xilen |
| 2571 + trn1 \x1\x1len, \x1\x1len, \x3\x3len |
| 2572 + trn2 \x3\x3len, \xi\x1len, \x3\x3len |
| 2573 +.endm |
| 2574 + |
| 2575 +.macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen |
| 2576 + mov \xi\xilen, \x0\xilen |
| 2577 + trn1 \x0\x0len, \x0\x0len, \x1\x1len |
| 2578 + trn2 \x1\x2len, \xi\x0len, \x1\x2len |
| 2579 + mov \xi\xilen, \x2\xilen |
| 2580 + trn1 \x2\x2len, \x2\x2len, \x3\x3len |
| 2581 + trn2 \x3\x2len, \xi\x1len, \x3\x3len |
| 2582 +.endm |
| 2583 + |
| 2584 +.macro transpose_4x4 x0, x1, x2, x3,x5 |
| 2585 + transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b |
| 2586 + transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b |
| 2587 +.endm |
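
The transpose macros above implement a 4x4 transpose of 16-bit elements by combining trn1/trn2 steps at 16-bit and 32-bit granularity. The scalar operation they reproduce is simply (plain C, illustrative, not part of the build):

    /* Scalar equivalent of transpose_4x4: swap element (i,j) with (j,i)
     * in a 4x4 block of 16-bit values (illustrative reference only). */
    #include <stdint.h>

    static void transpose_4x4_ref (int16_t m[4][4])
    {
      int i, j;
      for (i = 0; i < 4; i++) {
        for (j = i + 1; j < 4; j++) {
          int16_t t = m[i][j];
          m[i][j] = m[j][i];
          m[j][i] = t;
        }
      }
    }
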
| 2588 + |
| 2589 + |
| 2590 +#define CENTERJSAMPLE 128 |
| 2591 + |
| 2592 +/*****************************************************************************/ |
| 2593 + |
| 2594 +/* |
| 2595 + * Perform dequantization and inverse DCT on one block of coefficients. |
| 2596 + * |
| 2597 + * GLOBAL(void) |
| 2598 + * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block, |
| 2599 + * JSAMPARRAY output_buf, JDIMENSION output_col) |
| 2600 + */ |
| 2601 + |
| 2602 +#define FIX_0_298631336 (2446) |
| 2603 +#define FIX_0_390180644 (3196) |
| 2604 +#define FIX_0_541196100 (4433) |
| 2605 +#define FIX_0_765366865 (6270) |
| 2606 +#define FIX_0_899976223 (7373) |
| 2607 +#define FIX_1_175875602 (9633) |
| 2608 +#define FIX_1_501321110 (12299) |
| 2609 +#define FIX_1_847759065 (15137) |
| 2610 +#define FIX_1_961570560 (16069) |
| 2611 +#define FIX_2_053119869 (16819) |
| 2612 +#define FIX_2_562915447 (20995) |
| 2613 +#define FIX_3_072711026 (25172) |
| 2614 + |
| 2615 +#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560) |
| 2616 +#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644) |
| 2617 +#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065) |
| 2618 +#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447) |
| 2619 +#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223) |
| 2620 +#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223) |
| 2621 +#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447) |
| 2622 +#define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865) |
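
These values are the standard ISLOW IDCT constants from jidctint.c, i.e. each FIX_x_yyy equals round(x.yyy * 2^13) with CONST_BITS = 13. A quick standalone check (illustrative):

    /* Illustrative check: FIX(x) = (int)(x * 2^13 + 0.5) reproduces the
     * constants above (CONST_BITS == 13). */
    #include <assert.h>

    #define FIX(x)  ((int)((x) * 8192.0 + 0.5))

    int main(void)
    {
      assert(FIX(0.298631336) == 2446);
      assert(FIX(0.541196100) == 4433);
      assert(FIX(1.175875602) == 9633);
      assert(FIX(3.072711026) == 25172);
      return 0;
    }
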
| 2623 + |
| 2624 +/* |
| 2625 + * Reference SIMD-friendly 1-D ISLOW iDCT C implementation. |
| 2626 + * Uses some ideas from the comments in 'simd/jiss2int-64.asm' |
| 2627 + */ |
| 2628 +#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \ |
| 2629 +{ \ |
| 2630 + DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \ |
| 2631 + INT32 q1, q2, q3, q4, q5, q6, q7; \ |
| 2632 + INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \ |
| 2633 + \ |
| 2634 + /* 1-D iDCT input data */ \ |
| 2635 + row0 = xrow0; \ |
| 2636 + row1 = xrow1; \ |
| 2637 + row2 = xrow2; \ |
| 2638 + row3 = xrow3; \ |
| 2639 + row4 = xrow4; \ |
| 2640 + row5 = xrow5; \ |
| 2641 + row6 = xrow6; \ |
| 2642 + row7 = xrow7; \ |
| 2643 + \ |
| 2644 + q5 = row7 + row3; \ |
| 2645 + q4 = row5 + row1; \ |
| 2646 + q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \ |
| 2647 + MULTIPLY(q4, FIX_1_175875602); \ |
| 2648 + q7 = MULTIPLY(q5, FIX_1_175875602) + \ |
| 2649 + MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \ |
| 2650 + q2 = MULTIPLY(row2, FIX_0_541196100) + \ |
| 2651 + MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \ |
| 2652 + q4 = q6; \ |
| 2653 + q3 = ((INT32) row0 - (INT32) row4) << 13; \ |
| 2654 + q6 += MULTIPLY(row5, -FIX_2_562915447) + \ |
| 2655 + MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \ |
| 2656 + /* now we can use q1 (reloadable constants have been used up) */ \ |
| 2657 + q1 = q3 + q2; \ |
| 2658 + q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \ |
| 2659 + MULTIPLY(row1, -FIX_0_899976223); \ |
| 2660 + q5 = q7; \ |
| 2661 + q1 = q1 + q6; \ |
| 2662 + q7 += MULTIPLY(row7, -FIX_0_899976223) + \ |
| 2663 + MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \ |
| 2664 + \ |
| 2665 + /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \ |
| 2666 + tmp11_plus_tmp2 = q1; \ |
| 2667 + row1 = 0; \ |
| 2668 + \ |
| 2669 + q1 = q1 - q6; \ |
| 2670 + q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \ |
| 2671 + MULTIPLY(row3, -FIX_2_562915447); \ |
| 2672 + q1 = q1 - q6; \ |
| 2673 + q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \ |
| 2674 + MULTIPLY(row6, FIX_0_541196100); \ |
| 2675 + q3 = q3 - q2; \ |
| 2676 + \ |
| 2677 + /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \ |
| 2678 + tmp11_minus_tmp2 = q1; \ |
| 2679 + \ |
| 2680 + q1 = ((INT32) row0 + (INT32) row4) << 13; \ |
| 2681 + q2 = q1 + q6; \ |
| 2682 + q1 = q1 - q6; \ |
| 2683 + \ |
| 2684 + /* pick up the results */ \ |
| 2685 + tmp0 = q4; \ |
| 2686 + tmp1 = q5; \ |
| 2687 + tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \ |
| 2688 + tmp3 = q7; \ |
| 2689 + tmp10 = q2; \ |
| 2690 + tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \ |
| 2691 + tmp12 = q3; \ |
| 2692 + tmp13 = q1; \ |
| 2693 +} |
| 2694 + |
| 2695 +#define XFIX_0_899976223 v0.4h[0] |
| 2696 +#define XFIX_0_541196100 v0.4h[1] |
| 2697 +#define XFIX_2_562915447 v0.4h[2] |
| 2698 +#define XFIX_0_298631336_MINUS_0_899976223 v0.4h[3] |
| 2699 +#define XFIX_1_501321110_MINUS_0_899976223 v1.4h[0] |
| 2700 +#define XFIX_2_053119869_MINUS_2_562915447 v1.4h[1] |
| 2701 +#define XFIX_0_541196100_PLUS_0_765366865 v1.4h[2] |
| 2702 +#define XFIX_1_175875602 v1.4h[3] |
| 2703 +#define XFIX_1_175875602_MINUS_0_390180644 v2.4h[0] |
| 2704 +#define XFIX_0_541196100_MINUS_1_847759065 v2.4h[1] |
| 2705 +#define XFIX_3_072711026_MINUS_2_562915447 v2.4h[2] |
| 2706 +#define XFIX_1_175875602_MINUS_1_961570560 v2.4h[3] |
| 2707 + |
| 2708 +.balign 16 |
| 2709 +jsimd_idct_islow_neon_consts: |
| 2710 + .short FIX_0_899976223 /* d0[0] */ |
| 2711 + .short FIX_0_541196100 /* d0[1] */ |
| 2712 + .short FIX_2_562915447 /* d0[2] */ |
| 2713 + .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */ |
| 2714 + .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */ |
| 2715 + .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */ |
| 2716 + .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */ |
| 2717 + .short FIX_1_175875602 /* d1[3] */ |
| 2718 + /* reloadable constants */ |
| 2719 + .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */ |
| 2720 + .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */ |
| 2721 + .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */ |
| 2722 + .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */ |
| 2723 + |
| 2724 +asm_function jsimd_idct_islow_neon |
| 2725 + |
| 2726 + DCT_TABLE .req x0 |
| 2727 + COEF_BLOCK .req x1 |
| 2728 + OUTPUT_BUF .req x2 |
| 2729 + OUTPUT_COL .req x3 |
| 2730 + TMP1 .req x0 |
| 2731 + TMP2 .req x1 |
| 2732 + TMP3 .req x2 |
| 2733 + TMP4 .req x15 |
| 2734 + |
| 2735 + ROW0L .req v16 |
| 2736 + ROW0R .req v17 |
| 2737 + ROW1L .req v18 |
| 2738 + ROW1R .req v19 |
| 2739 + ROW2L .req v20 |
| 2740 + ROW2R .req v21 |
| 2741 + ROW3L .req v22 |
| 2742 + ROW3R .req v23 |
| 2743 + ROW4L .req v24 |
| 2744 + ROW4R .req v25 |
| 2745 + ROW5L .req v26 |
| 2746 + ROW5R .req v27 |
| 2747 + ROW6L .req v28 |
| 2748 + ROW6R .req v29 |
| 2749 + ROW7L .req v30 |
| 2750 + ROW7R .req v31 |
| 2751 + /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */ |
| 2752 + sub sp, sp, 272 |
| 2753 + str x15, [sp], 16 |
| 2754 + adr x15, jsimd_idct_islow_neon_consts |
| 2755 + st1 {v0.8b - v3.8b}, [sp], 32 |
| 2756 + st1 {v4.8b - v7.8b}, [sp], 32 |
| 2757 + st1 {v8.8b - v11.8b}, [sp], 32 |
| 2758 + st1 {v12.8b - v15.8b}, [sp], 32 |
| 2759 + st1 {v16.8b - v19.8b}, [sp], 32 |
| 2760 + st1 {v20.8b - v23.8b}, [sp], 32 |
| 2761 + st1 {v24.8b - v27.8b}, [sp], 32 |
| 2762 + st1 {v28.8b - v31.8b}, [sp], 32 |
| 2763 + ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32 |
| 2764 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 |
| 2765 + ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32 |
| 2766 + mul v16.4h, v16.4h, v0.4h |
| 2767 + mul v17.4h, v17.4h, v1.4h |
| 2768 + ins v16.2d[1], v17.2d[0] /* 128 bit q8 */ |
| 2769 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 |
| 2770 + mul v18.4h, v18.4h, v2.4h |
| 2771 + mul v19.4h, v19.4h, v3.4h |
| 2772 + ins v18.2d[1], v19.2d[0] /* 128 bit q9 */ |
| 2773 + ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32 |
| 2774 + mul v20.4h, v20.4h, v4.4h |
| 2775 + mul v21.4h, v21.4h, v5.4h |
| 2776 + ins v20.2d[1], v21.2d[0] /* 128 bit q10 */ |
| 2777 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32 |
| 2778 + mul v22.4h, v22.4h, v6.4h |
| 2779 + mul v23.4h, v23.4h, v7.4h |
| 2780 + ins v22.2d[1], v23.2d[0] /* 128 bit q11 */ |
| 2781 + ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK] |
| 2782 + mul v24.4h, v24.4h, v0.4h |
| 2783 + mul v25.4h, v25.4h, v1.4h |
| 2784 + ins v24.2d[1], v25.2d[0] /* 128 bit q12 */ |
| 2785 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32 |
| 2786 + mul v28.4h, v28.4h, v4.4h |
| 2787 + mul v29.4h, v29.4h, v5.4h |
| 2788 + ins v28.2d[1], v29.2d[0] /* 128 bit q14 */ |
| 2789 + mul v26.4h, v26.4h, v2.4h |
| 2790 + mul v27.4h, v27.4h, v3.4h |
| 2791 + ins v26.2d[1], v27.2d[0] /* 128 bit q13 */ |
| 2792 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15] /* load constants */ |
| 2793 + add x15, x15, #16 |
| 2794 + mul v30.4h, v30.4h, v6.4h |
| 2795 + mul v31.4h, v31.4h, v7.4h |
| 2796 + ins v30.2d[1], v31.2d[0] /* 128 bit q15 */ |
| 2797 + /* Go to the bottom of the stack */ |
| 2798 + sub sp, sp, 352 |
| 2799 + stp x4, x5, [sp], 16 |
| 2800 + st1 {v8.4h - v11.4h}, [sp], 32 /* save NEON registers */ |
| 2801 + st1 {v12.4h - v15.4h}, [sp], 32 |
| 2802 + /* 1-D IDCT, pass 1, left 4x8 half */ |
| 2803 + add v4.4h, ROW7L.4h, ROW3L.4h |
| 2804 + add v5.4h, ROW5L.4h, ROW1L.4h |
| 2805 + smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560 |
| 2806 + smlal v12.4s, v5.4h, XFIX_1_175875602 |
| 2807 + smull v14.4s, v4.4h, XFIX_1_175875602 |
| 2808 + /* Check for the zero coefficients in the right 4x8 half */ |
| 2809 + smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644 |
| 2810 + ssubl v6.4s, ROW0L.4h, ROW4L.4h |
| 2811 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))] |
| 2812 + smull v4.4s, ROW2L.4h, XFIX_0_541196100 |
| 2813 + smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065 |
| 2814 + orr x0, x4, x5 |
| 2815 + mov v8.16b, v12.16b |
| 2816 + smlsl v12.4s, ROW5L.4h, XFIX_2_562915447 |
| 2817 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))] |
| 2818 + smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 |
| 2819 + shl v6.4s, v6.4s, #13 |
| 2820 + orr x0, x0, x4 |
| 2821 + smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 |
| 2822 + orr x0, x0 , x5 |
| 2823 + add v2.4s, v6.4s, v4.4s |
| 2824 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))] |
| 2825 + mov v10.16b, v14.16b |
| 2826 + add v2.4s, v2.4s, v12.4s |
| 2827 + orr x0, x0, x4 |
| 2828 + smlsl v14.4s, ROW7L.4h, XFIX_0_899976223 |
| 2829 + orr x0, x0, x5 |
| 2830 + smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 |
| 2831 + rshrn ROW1L.4h, v2.4s, #11 |
| 2832 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))] |
| 2833 + sub v2.4s, v2.4s, v12.4s |
| 2834 + smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447 |
| 2835 + orr x0, x0, x4 |
| 2836 + smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 |
| 2837 + orr x0, x0, x5 |
| 2838 + sub v2.4s, v2.4s, v12.4s |
| 2839 + smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 |
| 2840 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))] |
| 2841 + smlal v12.4s, ROW6L.4h, XFIX_0_541196100 |
| 2842 + sub v6.4s, v6.4s, v4.4s |
| 2843 + orr x0, x0, x4 |
| 2844 + rshrn ROW6L.4h, v2.4s, #11 |
| 2845 + orr x0, x0, x5 |
| 2846 + add v2.4s, v6.4s, v10.4s |
| 2847 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))] |
| 2848 + sub v6.4s, v6.4s, v10.4s |
| 2849 + saddl v10.4s, ROW0L.4h, ROW4L.4h |
| 2850 + orr x0, x0, x4 |
| 2851 + rshrn ROW2L.4h, v2.4s, #11 |
| 2852 + orr x0, x0, x5 |
| 2853 + rshrn ROW5L.4h, v6.4s, #11 |
| 2854 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))] |
| 2855 + shl v10.4s, v10.4s, #13 |
| 2856 + smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223 |
| 2857 + orr x0, x0, x4 |
| 2858 + add v4.4s, v10.4s, v12.4s |
| 2859 + orr x0, x0, x5 |
| 2860 + cmp x0, #0 /* orrs instruction removed */ |
| 2861 + sub v2.4s, v10.4s, v12.4s |
| 2862 + add v12.4s, v4.4s, v14.4s |
| 2863 + ldp w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))] |
| 2864 + sub v4.4s, v4.4s, v14.4s |
| 2865 + add v10.4s, v2.4s, v8.4s |
| 2866 + orr x0, x4, x5 |
| 2867 + sub v6.4s, v2.4s, v8.4s |
| 2868 + /* pop {x4, x5} */ |
| 2869 + sub sp, sp, 80 |
| 2870 + ldp x4, x5, [sp], 16 |
| 2871 + rshrn ROW7L.4h, v4.4s, #11 |
| 2872 + rshrn ROW3L.4h, v10.4s, #11 |
| 2873 + rshrn ROW0L.4h, v12.4s, #11 |
| 2874 + rshrn ROW4L.4h, v6.4s, #11 |
| 2875 + |
| 2876 + beq 3f /* Go to do some special handling for the sparse right 4x8 half */ |
| 2877 + |
| 2878 + /* 1-D IDCT, pass 1, right 4x8 half */ |
| 2879 + ld1 {v2.4h}, [x15] /* reload constants */ |
| 2880 + add v10.4h, ROW7R.4h, ROW3R.4h |
| 2881 + add v8.4h, ROW5R.4h, ROW1R.4h |
| 2882 + /* Transpose ROW6L <-> ROW7L (v3 available free register) */ |
| 2883 + transpose ROW6L, ROW7L, v3, .16b, .4h |
| 2884 + smull v12.4s, v10.4h, XFIX_1_175875602_MINUS_1_961570560 |
| 2885 + smlal v12.4s, v8.4h, XFIX_1_175875602 |
| 2886 + /* Transpose ROW2L <-> ROW3L (v3 available free register) */ |
| 2887 + transpose ROW2L, ROW3L, v3, .16b, .4h |
| 2888 + smull v14.4s, v10.4h, XFIX_1_175875602 |
| 2889 + smlal v14.4s, v8.4h, XFIX_1_175875602_MINUS_0_390180644 |
| 2890 + /* Transpose ROW0L <-> ROW1L (v3 available free register) */ |
| 2891 + transpose ROW0L, ROW1L, v3, .16b, .4h |
| 2892 + ssubl v6.4s, ROW0R.4h, ROW4R.4h |
| 2893 + smull v4.4s, ROW2R.4h, XFIX_0_541196100 |
| 2894 + smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065 |
| 2895 + /* Transpose ROW4L <-> ROW5L (v3 available free register) */ |
| 2896 + transpose ROW4L, ROW5L, v3, .16b, .4h |
| 2897 + mov v8.16b, v12.16b |
| 2898 + smlsl v12.4s, ROW5R.4h, XFIX_2_562915447 |
| 2899 + smlal v12.4s, ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447 |
| 2900 + /* Transpose ROW1L <-> ROW3L (v3 available free register) */ |
| 2901 + transpose ROW1L, ROW3L, v3, .16b, .2s |
| 2902 + shl v6.4s, v6.4s, #13 |
| 2903 + smlsl v8.4s, ROW1R.4h, XFIX_0_899976223 |
| 2904 + /* Transpose ROW4L <-> ROW6L (v3 available free register) */ |
| 2905 + transpose ROW4L, ROW6L, v3, .16b, .2s |
| 2906 + add v2.4s, v6.4s, v4.4s |
| 2907 + mov v10.16b, v14.16b |
| 2908 + add v2.4s, v2.4s, v12.4s |
| 2909 + /* Transpose ROW0L <-> ROW2L (v3 available free register) */ |
| 2910 + transpose ROW0L, ROW2L, v3, .16b, .2s |
| 2911 + smlsl v14.4s, ROW7R.4h, XFIX_0_899976223 |
| 2912 + smlal v14.4s, ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223 |
| 2913 + rshrn ROW1R.4h, v2.4s, #11 |
| 2914 + /* Transpose ROW5L <-> ROW7L (v3 available free register) */ |
| 2915 + transpose ROW5L, ROW7L, v3, .16b, .2s |
| 2916 + sub v2.4s, v2.4s, v12.4s |
| 2917 + smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447 |
| 2918 + smlsl v10.4s, ROW3R.4h, XFIX_2_562915447 |
| 2919 + sub v2.4s, v2.4s, v12.4s |
| 2920 + smull v12.4s, ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865 |
| 2921 + smlal v12.4s, ROW6R.4h, XFIX_0_541196100 |
| 2922 + sub v6.4s, v6.4s, v4.4s |
| 2923 + rshrn ROW6R.4h, v2.4s, #11 |
| 2924 + add v2.4s, v6.4s, v10.4s |
| 2925 + sub v6.4s, v6.4s, v10.4s |
| 2926 + saddl v10.4s, ROW0R.4h, ROW4R.4h |
| 2927 + rshrn ROW2R.4h, v2.4s, #11 |
| 2928 + rshrn ROW5R.4h, v6.4s, #11 |
| 2929 + shl v10.4s, v10.4s, #13 |
| 2930 + smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223 |
| 2931 + add v4.4s, v10.4s, v12.4s |
| 2932 + sub v2.4s, v10.4s, v12.4s |
| 2933 + add v12.4s, v4.4s, v14.4s |
| 2934 + sub v4.4s, v4.4s, v14.4s |
| 2935 + add v10.4s, v2.4s, v8.4s |
| 2936 + sub v6.4s, v2.4s, v8.4s |
| 2937 + rshrn ROW7R.4h, v4.4s, #11 |
| 2938 + rshrn ROW3R.4h, v10.4s, #11 |
| 2939 + rshrn ROW0R.4h, v12.4s, #11 |
| 2940 + rshrn ROW4R.4h, v6.4s, #11 |
| 2941 + /* Transpose right 4x8 half */ |
| 2942 + transpose ROW6R, ROW7R, v3, .16b, .4h |
| 2943 + transpose ROW2R, ROW3R, v3, .16b, .4h |
| 2944 + transpose ROW0R, ROW1R, v3, .16b, .4h |
| 2945 + transpose ROW4R, ROW5R, v3, .16b, .4h |
| 2946 + transpose ROW1R, ROW3R, v3, .16b, .2s |
| 2947 + transpose ROW4R, ROW6R, v3, .16b, .2s |
| 2948 + transpose ROW0R, ROW2R, v3, .16b, .2s |
| 2949 + transpose ROW5R, ROW7R, v3, .16b, .2s |
| 2950 + |
| 2951 +1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */ |
| 2952 + ld1 {v2.4h}, [x15] /* reload constants */ |
| 2953 + smull v12.4S, ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */ |
| 2954 + smlal v12.4s, ROW1L.4h, XFIX_1_175875602 |
| 2955 + smlal v12.4s, ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */ |
| 2956 + smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 |
| 2957 + smull v14.4s, ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */ |
| 2958 + smlal v14.4s, ROW3L.4h, XFIX_1_175875602 |
| 2959 + smlal v14.4s, ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */ |
| 2960 + smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 |
| 2961 + ssubl v6.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */ |
| 2962 + smull v4.4s, ROW2L.4h, XFIX_0_541196100 |
| 2963 + smlal v4.4s, ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L.4h <-> ROW2R.4h */ |
| 2964 + mov v8.16b, v12.16b |
| 2965 + smlsl v12.4s, ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R.4h */ |
| 2966 + smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 |
| 2967 + shl v6.4s, v6.4s, #13 |
| 2968 + smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 |
| 2969 + add v2.4s, v6.4s, v4.4s |
| 2970 + mov v10.16b, v14.16b |
| 2971 + add v2.4s, v2.4s, v12.4s |
| 2972 + smlsl v14.4s, ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R.4h */ |
| 2973 + smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 |
| 2974 + shrn ROW1L.4h, v2.4s, #16 |
| 2975 + sub v2.4s, v2.4s, v12.4s |
| 2976 + smlal v10.4s, ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L.4h <-> ROW1R.4h */ |
| 2977 + smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 |
| 2978 + sub v2.4s, v2.4s, v12.4s |
| 2979 + smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 |
| 2980 + smlal v12.4s, ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */ |
| 2981 + sub v6.4s, v6.4s, v4.4s |
| 2982 + shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ |
| 2983 + add v2.4s, v6.4s, v10.4s |
| 2984 + sub v6.4s, v6.4s, v10.4s |
| 2985 + saddl v10.4s, ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */ |
| 2986 + shrn ROW2L.4h, v2.4s, #16 |
| 2987 + shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ |
| 2988 + shl v10.4s, v10.4s, #13 |
| 2989 + smlal v8.4s, ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L.4h <-> ROW3R.4h */ |
| 2990 + add v4.4s, v10.4s, v12.4s |
| 2991 + sub v2.4s, v10.4s, v12.4s |
| 2992 + add v12.4s, v4.4s, v14.4s |
| 2993 + sub v4.4s, v4.4s, v14.4s |
| 2994 + add v10.4s, v2.4s, v8.4s |
| 2995 + sub v6.4s, v2.4s, v8.4s |
| 2996 + shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ |
| 2997 + shrn ROW3L.4h, v10.4s, #16 |
| 2998 + shrn ROW0L.4h, v12.4s, #16 |
| 2999 + shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ |
| 3000 + /* 1-D IDCT, pass 2, right 4x8 half */ |
| 3001 + ld1 {v2.4h}, [x15] /* reload constants */ |
| 3002 + smull v12.4s, ROW5R.4h, XFIX_1_175875602 |
| 3003 + smlal v12.4s, ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */ |
| 3004 + smlal v12.4s, ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560 |
| 3005 + smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */ |
| 3006 + smull v14.4s, ROW7R.4h, XFIX_1_175875602 |
| 3007 + smlal v14.4s, ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */ |
| 3008 + smlal v14.4s, ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644 |
| 3009 + smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */ |
| 3010 + ssubl v6.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */ |
| 3011 + smull v4.4s, ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */ |
| 3012 + smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065 |
| 3013 + mov v8.16b, v12.16b |
| 3014 + smlsl v12.4s, ROW5R.4h, XFIX_2_562915447 |
| 3015 + smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L.4h <-> ROW3R.4h */ |
| 3016 + shl v6.4s, v6.4s, #13 |
| 3017 + smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R.4h */ |
| 3018 + add v2.4s, v6.4s, v4.4s |
| 3019 + mov v10.16b, v14.16b |
| 3020 + add v2.4s, v2.4s, v12.4s |
| 3021 + smlsl v14.4s, ROW7R.4h, XFIX_0_899976223 |
| 3022 + smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L.4h <-> ROW1R.4h */ |
| 3023 + shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ |
| 3024 + sub v2.4s, v2.4s, v12.4s |
| 3025 + smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447 |
| 3026 + smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R.4h */ |
| 3027 + sub v2.4s, v2.4s, v12.4s |
| 3028 + smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L.4h <-> ROW2R.4h */ |
| 3029 + smlal v12.4s, ROW6R.4h, XFIX_0_541196100 |
| 3030 + sub v6.4s, v6.4s, v4.4s |
| 3031 + shrn ROW6R.4h, v2.4s, #16 |
| 3032 + add v2.4s, v6.4s, v10.4s |
| 3033 + sub v6.4s, v6.4s, v10.4s |
| 3034 + saddl v10.4s, ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */ |
| 3035 + shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ |
| 3036 + shrn ROW5R.4h, v6.4s, #16 |
| 3037 + shl v10.4s, v10.4s, #13 |
| 3038 + smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223 |
| 3039 + add v4.4s, v10.4s, v12.4s |
| 3040 + sub v2.4s, v10.4s, v12.4s |
| 3041 + add v12.4s, v4.4s, v14.4s |
| 3042 + sub v4.4s, v4.4s, v14.4s |
| 3043 + add v10.4s, v2.4s, v8.4s |
| 3044 + sub v6.4s, v2.4s, v8.4s |
| 3045 + shrn ROW7R.4h, v4.4s, #16 |
| 3046 + shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ |
| 3047 + shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ |
| 3048 + shrn ROW4R.4h, v6.4s, #16 |
| 3049 + |
| 3050 +2: /* Descale to 8-bit and range limit */ |
| 3051 + ins v16.2d[1], v17.2d[0] |
| 3052 + ins v18.2d[1], v19.2d[0] |
| 3053 + ins v20.2d[1], v21.2d[0] |
| 3054 + ins v22.2d[1], v23.2d[0] |
| 3055 + sqrshrn v16.8b, v16.8h, #2 |
| 3056 + sqrshrn2 v16.16b, v18.8h, #2 |
| 3057 + sqrshrn v18.8b, v20.8h, #2 |
| 3058 + sqrshrn2 v18.16b, v22.8h, #2 |
| 3059 + |
| 3060 + /* vpop {v8.4h - d15.4h} */ /* restore NEON registers */ |
| 3061 + ld1 {v8.4h - v11.4h}, [sp], 32 |
| 3062 + ld1 {v12.4h - v15.4h}, [sp], 32 |
| 3063 + ins v24.2d[1], v25.2d[0] |
| 3064 + |
| 3065 + sqrshrn v20.8b, v24.8h, #2 |
| 3066 + /* Transpose the final 8-bit samples and do signed->unsigned conversion */ |
| 3067 + /* trn1 v16.8h, v16.8h, v18.8h */ |
| 3068 + transpose v16, v18, v3, .16b, .8h |
| 3069 + ins v26.2d[1], v27.2d[0] |
| 3070 + ins v28.2d[1], v29.2d[0] |
| 3071 + ins v30.2d[1], v31.2d[0] |
| 3072 + sqrshrn2 v20.16b, v26.8h, #2 |
| 3073 + sqrshrn v22.8b, v28.8h, #2 |
| 3074 + movi v0.16b, #(CENTERJSAMPLE) |
| 3075 + sqrshrn2 v22.16b, v30.8h, #2 |
| 3076 + transpose_single v16, v17, v3, .2d, .8b |
| 3077 + transpose_single v18, v19, v3, .2d, .8b |
| 3078 + add v16.8b, v16.8b, v0.8b |
| 3079 + add v17.8b, v17.8b, v0.8b |
| 3080 + add v18.8b, v18.8b, v0.8b |
| 3081 + add v19.8b, v19.8b, v0.8b |
| 3082 + transpose v20, v22, v3, .16b, .8h |
| 3083 + /* Store results to the output buffer */ |
| 3084 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 3085 + add TMP1, TMP1, OUTPUT_COL |
| 3086 + add TMP2, TMP2, OUTPUT_COL |
| 3087 + st1 {v16.8b}, [TMP1] |
| 3088 + transpose_single v20, v21, v3, .2d, .8b |
| 3089 + st1 {v17.8b}, [TMP2] |
| 3090 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 3091 + add TMP1, TMP1, OUTPUT_COL |
| 3092 + add TMP2, TMP2, OUTPUT_COL |
| 3093 + st1 {v18.8b}, [TMP1] |
| 3094 + add v20.8b, v20.8b, v0.8b |
| 3095 + add v21.8b, v21.8b, v0.8b |
| 3096 + st1 {v19.8b}, [TMP2] |
| 3097 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 3098 + ldp TMP3, TMP4, [OUTPUT_BUF] |
| 3099 + add TMP1, TMP1, OUTPUT_COL |
| 3100 + add TMP2, TMP2, OUTPUT_COL |
| 3101 + add TMP3, TMP3, OUTPUT_COL |
| 3102 + add TMP4, TMP4, OUTPUT_COL |
| 3103 + transpose_single v22, v23, v3, .2d, .8b |
| 3104 + st1 {v20.8b}, [TMP1] |
| 3105 + add v22.8b, v22.8b, v0.8b |
| 3106 + add v23.8b, v23.8b, v0.8b |
| 3107 + st1 {v21.8b}, [TMP2] |
| 3108 + st1 {v22.8b}, [TMP3] |
| 3109 + st1 {v23.8b}, [TMP4] |
| 3110 + ldr x15, [sp], 16 |
| 3111 + ld1 {v0.8b - v3.8b}, [sp], 32 |
| 3112 + ld1 {v4.8b - v7.8b}, [sp], 32 |
| 3113 + ld1 {v8.8b - v11.8b}, [sp], 32 |
| 3114 + ld1 {v12.8b - v15.8b}, [sp], 32 |
| 3115 + ld1 {v16.8b - v19.8b}, [sp], 32 |
| 3116 + ld1 {v20.8b - v23.8b}, [sp], 32 |
| 3117 + ld1 {v24.8b - v27.8b}, [sp], 32 |
| 3118 + ld1 {v28.8b - v31.8b}, [sp], 32 |
| 3119 + blr x30 |
| 3120 + |
| 3121 +3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */ |
| 3122 + |
| 3123 + /* Transpose left 4x8 half */ |
| 3124 + transpose ROW6L, ROW7L, v3, .16b, .4h |
| 3125 + transpose ROW2L, ROW3L, v3, .16b, .4h |
| 3126 + transpose ROW0L, ROW1L, v3, .16b, .4h |
| 3127 + transpose ROW4L, ROW5L, v3, .16b, .4h |
| 3128 + shl ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */ |
| 3129 + transpose ROW1L, ROW3L, v3, .16b, .2s |
| 3130 + transpose ROW4L, ROW6L, v3, .16b, .2s |
| 3131 + transpose ROW0L, ROW2L, v3, .16b, .2s |
| 3132 + transpose ROW5L, ROW7L, v3, .16b, .2s |
| 3133 + cmp x0, #0 |
| 3134 + beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */ |
| 3135 + |
| 3136 + /* Only row 0 is non-zero for the right 4x8 half */ |
| 3137 + dup ROW1R.4h, ROW0R.4h[1] |
| 3138 + dup ROW2R.4h, ROW0R.4h[2] |
| 3139 + dup ROW3R.4h, ROW0R.4h[3] |
| 3140 + dup ROW4R.4h, ROW0R.4h[0] |
| 3141 + dup ROW5R.4h, ROW0R.4h[1] |
| 3142 + dup ROW6R.4h, ROW0R.4h[2] |
| 3143 + dup ROW7R.4h, ROW0R.4h[3] |
| 3144 + dup ROW0R.4h, ROW0R.4h[0] |
| 3145 + b 1b /* Go to 'normal' second pass */ |
| 3146 + |
| 3147 +4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */ |
| 3148 + ld1 {v2.4h}, [x15] /* reload constants */ |
| 3149 + smull v12.4s, ROW1L.4h, XFIX_1_175875602 |
| 3150 + smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560 |
| 3151 + smull v14.4s, ROW3L.4h, XFIX_1_175875602 |
| 3152 + smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644 |
| 3153 + smull v4.4s, ROW2L.4h, XFIX_0_541196100 |
| 3154 + sshll v6.4s, ROW0L.4h, #13 |
| 3155 + mov v8.16b, v12.16b |
| 3156 + smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447 |
| 3157 + smlsl v8.4s, ROW1L.4h, XFIX_0_899976223 |
| 3158 + add v2.4s, v6.4s, v4.4s |
| 3159 + mov v10.16b, v14.16b |
| 3160 + smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223 |
| 3161 + add v2.4s, v2.4s, v12.4s |
| 3162 + add v12.4s, v12.4s, v12.4s |
| 3163 + smlsl v10.4s, ROW3L.4h, XFIX_2_562915447 |
| 3164 + shrn ROW1L.4h, v2.4s, #16 |
| 3165 + sub v2.4s, v2.4s, v12.4s |
| 3166 + smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865 |
| 3167 + sub v6.4s, v6.4s, v4.4s |
| 3168 + shrn ROW2R.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ |
| 3169 + add v2.4s, v6.4s, v10.4s |
| 3170 + sub v6.4s, v6.4s, v10.4s |
| 3171 + sshll v10.4s, ROW0L.4h, #13 |
| 3172 + shrn ROW2L.4h, v2.4s, #16 |
| 3173 + shrn ROW1R.4h, v6.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ |
| 3174 + add v4.4s, v10.4s, v12.4s |
| 3175 + sub v2.4s, v10.4s, v12.4s |
| 3176 + add v12.4s, v4.4s, v14.4s |
| 3177 + sub v4.4s, v4.4s, v14.4s |
| 3178 + add v10.4s, v2.4s, v8.4s |
| 3179 + sub v6.4s, v2.4s, v8.4s |
| 3180 + shrn ROW3R.4h, v4.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ |
| 3181 + shrn ROW3L.4h, v10.4s, #16 |
| 3182 + shrn ROW0L.4h, v12.4s, #16 |
| 3183 + shrn ROW0R.4h, v6.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ |
| 3184 + /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */ |
| 3185 + ld1 {v2.4h}, [x15] /* reload constants */ |
| 3186 + smull v12.4s, ROW5L.4h, XFIX_1_175875602 |
| 3187 + smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 |
| 3188 + smull v14.4s, ROW7L.4h, XFIX_1_175875602 |
| 3189 + smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 |
| 3190 + smull v4.4s, ROW6L.4h, XFIX_0_541196100 |
| 3191 + sshll v6.4s, ROW4L.4h, #13 |
| 3192 + mov v8.16b, v12.16b |
| 3193 + smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 |
| 3194 + smlsl v8.4s, ROW5L.4h, XFIX_0_899976223 |
| 3195 + add v2.4s, v6.4s, v4.4s |
| 3196 + mov v10.16b, v14.16b |
| 3197 + smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 |
| 3198 + add v2.4s, v2.4s, v12.4s |
| 3199 + add v12.4s, v12.4s, v12.4s |
| 3200 + smlsl v10.4s, ROW7L.4h, XFIX_2_562915447 |
| 3201 + shrn ROW5L.4h, v2.4s, #16 /* ROW5L.4h <-> ROW1R.4h */ |
| 3202 + sub v2.4s, v2.4s, v12.4s |
| 3203 + smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 |
| 3204 + sub v6.4s, v6.4s, v4.4s |
| 3205 + shrn ROW6R.4h, v2.4s, #16 |
| 3206 + add v2.4s, v6.4s, v10.4s |
| 3207 + sub v6.4s, v6.4s, v10.4s |
| 3208 + sshll v10.4s, ROW4L.4h, #13 |
| 3209 + shrn ROW6L.4h, v2.4s, #16 /* ROW6L.4h <-> ROW2R.4h */ |
| 3210 + shrn ROW5R.4h, v6.4s, #16 |
| 3211 + add v4.4s, v10.4s, v12.4s |
| 3212 + sub v2.4s, v10.4s, v12.4s |
| 3213 + add v12.4s, v4.4s, v14.4s |
| 3214 + sub v4.4s, v4.4s, v14.4s |
| 3215 + add v10.4s, v2.4s, v8.4s |
| 3216 + sub v6.4s, v2.4s, v8.4s |
| 3217 + shrn ROW7R.4h, v4.4s, #16 |
| 3218 + shrn ROW7L.4h, v10.4s, #16 /* ROW7L.4h <-> ROW3R.4h */ |
| 3219 + shrn ROW4L.4h, v12.4s, #16 /* ROW4L.4h <-> ROW0R.4h */ |
| 3220 + shrn ROW4R.4h, v6.4s, #16 |
| 3221 + b 2b /* Go to epilogue */ |
| 3222 + |
| 3223 + .unreq DCT_TABLE |
| 3224 + .unreq COEF_BLOCK |
| 3225 + .unreq OUTPUT_BUF |
| 3226 + .unreq OUTPUT_COL |
| 3227 + .unreq TMP1 |
| 3228 + .unreq TMP2 |
| 3229 + .unreq TMP3 |
| 3230 + .unreq TMP4 |
| 3231 + |
| 3232 + .unreq ROW0L |
| 3233 + .unreq ROW0R |
| 3234 + .unreq ROW1L |
| 3235 + .unreq ROW1R |
| 3236 + .unreq ROW2L |
| 3237 + .unreq ROW2R |
| 3238 + .unreq ROW3L |
| 3239 + .unreq ROW3R |
| 3240 + .unreq ROW4L |
| 3241 + .unreq ROW4R |
| 3242 + .unreq ROW5L |
| 3243 + .unreq ROW5R |
| 3244 + .unreq ROW6L |
| 3245 + .unreq ROW6R |
| 3246 + .unreq ROW7L |
| 3247 + .unreq ROW7R |
| 3248 + |
| 3249 + |
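
In the epilogue at label 2 above, the pass-2 results (already shifted right by 16) receive a rounding shift by 2 via sqrshrn, are recentred by CENTERJSAMPLE and saturated to the 8-bit sample range, corresponding to the DESCALE(..., CONST_BITS+PASS1_BITS+3) plus range-limit step of the C implementation. A rough scalar sketch of that per-sample step (illustrative only):

    /* Rough scalar sketch of the descale/range-limit epilogue (label 2):
     * input is the pass-2 value after the >>16 step; output is one
     * unsigned 8-bit sample.  Illustrative only. */
    static unsigned char descale_and_center (int v)
    {
      int s = (v + 2) >> 2;          /* rounding shift, like sqrshrn #2 */
      s += 128;                      /* CENTERJSAMPLE */
      if (s < 0)   s = 0;            /* range limit */
      if (s > 255) s = 255;
      return (unsigned char)s;
    }
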
| 3250 +/*****************************************************************************/ |
| 3251 + |
| 3252 +/* |
| 3253 + * jsimd_idct_ifast_neon |
| 3254 + * |
| 3255 + * This function contains a fast, not so accurate integer implementation of |
| 3256 + * the inverse DCT (Discrete Cosine Transform). It uses the same calculations |
| 3257 + * and produces exactly the same output as IJG's original 'jpeg_idct_ifast' |
| 3258 + * function from jidctfst.c |
| 3259 + * |
| 3260 + * Normally 1-D AAN DCT needs 5 multiplications and 29 additions. |
| 3261 + * But in ARM NEON case some extra additions are required because VQDMULH |
| 3262 + * instruction can't handle the constants larger than 1. So the expressions |
| 3263 + * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x", |
| 3264 + * which introduces an extra addition. Overall, there are 6 extra additions |
| 3265 + * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions. |
| 3266 + */ |
| 3267 + |
| 3268 +#define XFIX_1_082392200 v0.4h[0] |
| 3269 +#define XFIX_1_414213562 v0.4h[1] |
| 3270 +#define XFIX_1_847759065 v0.4h[2] |
| 3271 +#define XFIX_2_613125930 v0.4h[3] |
| 3272 + |
| 3273 +.balign 16 |
| 3274 +jsimd_idct_ifast_neon_consts: |
| 3275 + .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */ |
| 3276 + .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */ |
| 3277 + .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */ |
| 3278 + .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */ |
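
Because sqdmulh by a 16-bit constant k multiplies by roughly k/32768, constants of 1.0 or more cannot be encoded directly; each .short above therefore stores only the fractional part (c - 1.0, or c - 2.0 for 2.613125930), and the integer part is restored with the extra additions described in the comment. A quick numeric check (illustrative):

    /* Illustrative check of the XFIX encodings: value / 32768 recovers the
     * fractional part of each constant (the integer part is added back
     * explicitly in the IDCT code). */
    #include <stdio.h>

    int main(void)
    {
      printf("%f\n", (277 * 128 - 256 * 128) / 32768.0);  /* ~0.0820 = 1.0824 - 1 */
      printf("%f\n", (362 * 128 - 256 * 128) / 32768.0);  /* ~0.4141 = 1.4142 - 1 */
      printf("%f\n", (473 * 128 - 256 * 128) / 32768.0);  /* ~0.8477 = 1.8478 - 1 */
      printf("%f\n", (669 * 128 - 512 * 128) / 32768.0);  /* ~0.6133 = 2.6131 - 2 */
      return 0;
    }
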
| 3279 + |
| 3280 +asm_function jsimd_idct_ifast_neon |
| 3281 + |
| 3282 + DCT_TABLE .req x0 |
| 3283 + COEF_BLOCK .req x1 |
| 3284 + OUTPUT_BUF .req x2 |
| 3285 + OUTPUT_COL .req x3 |
| 3286 + TMP1 .req x0 |
| 3287 + TMP2 .req x1 |
| 3288 + TMP3 .req x2 |
| 3289 + TMP4 .req x22 |
| 3290 + TMP5 .req x23 |
| 3291 + |
| 3292 + /* Load and dequantize coefficients into NEON registers |
| 3293 + * with the following allocation: |
| 3294 + * 0 1 2 3 | 4 5 6 7 |
| 3295 + * ---------+-------- |
| 3296 + * 0 | d16 | d17 ( v8.8h ) |
| 3297 + * 1 | d18 | d19 ( v9.8h ) |
| 3298 + * 2 | d20 | d21 ( v10.8h ) |
| 3299 + * 3 | d22 | d23 ( v11.8h ) |
| 3300 + * 4 | d24 | d25 ( v12.8h ) |
| 3301 + * 5 | d26 | d27 ( v13.8h ) |
| 3302 + * 6 | d28 | d29 ( v14.8h ) |
| 3303 + * 7 | d30 | d31 ( v15.8h ) |
| 3304 + */ |
| 3305 + /* Save NEON registers used in fast IDCT */ |
| 3306 + sub sp, sp, #176 |
| 3307 + stp x22, x23, [sp], 16 |
| 3308 + adr x23, jsimd_idct_ifast_neon_consts |
| 3309 + st1 {v0.8b - v3.8b}, [sp], 32 |
| 3310 + st1 {v4.8b - v7.8b}, [sp], 32 |
| 3311 + st1 {v8.8b - v11.8b}, [sp], 32 |
| 3312 + st1 {v12.8b - v15.8b}, [sp], 32 |
| 3313 + st1 {v16.8b - v19.8b}, [sp], 32 |
| 3314 + ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32 |
| 3315 + ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 |
| 3316 + ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32 |
| 3317 + mul v8.8h, v8.8h, v0.8h |
| 3318 + ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 |
| 3319 + mul v9.8h, v9.8h, v1.8h |
| 3320 + ld1 {v12.8h, v13.8h}, [COEF_BLOCK], 32 |
| 3321 + mul v10.8h, v10.8h, v2.8h |
| 3322 + ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32 |
| 3323 + mul v11.8h, v11.8h, v3.8h |
| 3324 + ld1 {v14.8h, v15.8h}, [COEF_BLOCK], 32 |
| 3325 + mul v12.8h, v12.8h, v0.8h |
| 3326 + ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32 |
| 3327 + mul v14.8h, v14.8h, v2.8h |
| 3328 + mul v13.8h, v13.8h, v1.8h |
| 3329 + ld1 {v0.4h}, [x23] /* load constants */ |
| 3330 + mul v15.8h, v15.8h, v3.8h |
| 3331 + |
| 3332 + /* 1-D IDCT, pass 1 */ |
| 3333 + sub v2.8h, v10.8h, v14.8h |
| 3334 + add v14.8h, v10.8h, v14.8h |
| 3335 + sub v1.8h, v11.8h, v13.8h |
| 3336 + add v13.8h, v11.8h, v13.8h |
| 3337 + sub v5.8h, v9.8h, v15.8h |
| 3338 + add v15.8h, v9.8h, v15.8h |
| 3339 + sqdmulh v4.8h, v2.8h, XFIX_1_414213562 |
| 3340 + sqdmulh v6.8h, v1.8h, XFIX_2_613125930 |
| 3341 + add v3.8h, v1.8h, v1.8h |
| 3342 + sub v1.8h, v5.8h, v1.8h |
| 3343 + add v10.8h, v2.8h, v4.8h |
| 3344 + sqdmulh v4.8h, v1.8h, XFIX_1_847759065 |
| 3345 + sub v2.8h, v15.8h, v13.8h |
| 3346 + add v3.8h, v3.8h, v6.8h |
| 3347 + sqdmulh v6.8h, v2.8h, XFIX_1_414213562 |
| 3348 + add v1.8h, v1.8h, v4.8h |
| 3349 + sqdmulh v4.8h, v5.8h, XFIX_1_082392200 |
| 3350 + sub v10.8h, v10.8h, v14.8h |
| 3351 + add v2.8h, v2.8h, v6.8h |
| 3352 + sub v6.8h, v8.8h, v12.8h |
| 3353 + add v12.8h, v8.8h, v12.8h |
| 3354 + add v9.8h, v5.8h, v4.8h |
| 3355 + add v5.8h, v6.8h, v10.8h |
| 3356 + sub v10.8h, v6.8h, v10.8h |
| 3357 + add v6.8h, v15.8h, v13.8h |
| 3358 + add v8.8h, v12.8h, v14.8h |
| 3359 + sub v3.8h, v6.8h, v3.8h |
| 3360 + sub v12.8h, v12.8h, v14.8h |
| 3361 + sub v3.8h, v3.8h, v1.8h |
| 3362 + sub v1.8h, v9.8h, v1.8h |
| 3363 + add v2.8h, v3.8h, v2.8h |
| 3364 + sub v15.8h, v8.8h, v6.8h |
| 3365 + add v1.8h, v1.8h, v2.8h |
| 3366 + add v8.8h, v8.8h, v6.8h |
| 3367 + add v14.8h, v5.8h, v3.8h |
| 3368 + sub v9.8h, v5.8h, v3.8h |
| 3369 + sub v13.8h, v10.8h, v2.8h |
| 3370 + add v10.8h, v10.8h, v2.8h |
| 3371 + /* Transpose q8-q9 */ |
| 3372 + mov v18.16b, v8.16b |
| 3373 + trn1 v8.8h, v8.8h, v9.8h |
| 3374 + trn2 v9.8h, v18.8h, v9.8h |
| 3375 + sub v11.8h, v12.8h, v1.8h |
| 3376 + /* Transpose q14-q15 */ |
| 3377 + mov v18.16b, v14.16b |
| 3378 + trn1 v14.8h, v14.8h, v15.8h |
| 3379 + trn2 v15.8h, v18.8h, v15.8h |
| 3380 + add v12.8h, v12.8h, v1.8h |
| 3381 + /* Transpose q10-q11 */ |
| 3382 + mov v18.16b, v10.16b |
| 3383 + trn1 v10.8h, v10.8h, v11.8h |
| 3384 + trn2 v11.8h, v18.8h, v11.8h |
| 3385 + /* Transpose q12-q13 */ |
| 3386 + mov v18.16b, v12.16b |
| 3387 + trn1 v12.8h, v12.8h, v13.8h |
| 3388 + trn2 v13.8h, v18.8h, v13.8h |
| 3389 + /* Transpose q9-q11 */ |
| 3390 + mov v18.16b, v9.16b |
| 3391 + trn1 v9.4s, v9.4s, v11.4s |
| 3392 + trn2 v11.4s, v18.4s, v11.4s |
| 3393 + /* Transpose q12-q14 */ |
| 3394 + mov v18.16b, v12.16b |
| 3395 + trn1 v12.4s, v12.4s, v14.4s |
| 3396 + trn2 v14.4s, v18.4s, v14.4s |
| 3397 + /* Transpose q8-q10 */ |
| 3398 + mov v18.16b, v8.16b |
| 3399 + trn1 v8.4s, v8.4s, v10.4s |
| 3400 + trn2 v10.4s, v18.4s, v10.4s |
| 3401 + /* Transpose q13-q15 */ |
| 3402 + mov v18.16b, v13.16b |
| 3403 + trn1 v13.4s, v13.4s, v15.4s |
| 3404 + trn2 v15.4s, v18.4s, v15.4s |
| 3405 + /* vswp v14.4h, v10-MSB.4h */ |
| 3406 + umov x22, v14.d[0] |
| 3407 + ins v14.2d[0], v10.2d[1] |
| 3408 + ins v10.2d[1], x22 |
| 3409 + /* vswp v13.4h, v9MSB.4h */ |
| 3410 + |
| 3411 + umov x22, v13.d[0] |
| 3412 + ins v13.2d[0], v9.2d[1] |
| 3413 + ins v9.2d[1], x22 |
| 3414 + /* 1-D IDCT, pass 2 */ |
| 3415 + sub v2.8h, v10.8h, v14.8h |
| 3416 + /* vswp v15.4h, v11MSB.4h */ |
| 3417 + umov x22, v15.d[0] |
| 3418 + ins v15.2d[0], v11.2d[1] |
| 3419 + ins v11.2d[1], x22 |
| 3420 + add v14.8h, v10.8h, v14.8h |
| 3421 + /* vswp v12.4h, v8-MSB.4h */ |
| 3422 + umov x22, v12.d[0] |
| 3423 + ins v12.2d[0], v8.2d[1] |
| 3424 + ins v8.2d[1], x22 |
| 3425 + sub v1.8h, v11.8h, v13.8h |
| 3426 + add v13.8h, v11.8h, v13.8h |
| 3427 + sub v5.8h, v9.8h, v15.8h |
| 3428 + add v15.8h, v9.8h, v15.8h |
| 3429 + sqdmulh v4.8h, v2.8h, XFIX_1_414213562 |
| 3430 + sqdmulh v6.8h, v1.8h, XFIX_2_613125930 |
| 3431 + add v3.8h, v1.8h, v1.8h |
| 3432 + sub v1.8h, v5.8h, v1.8h |
| 3433 + add v10.8h, v2.8h, v4.8h |
| 3434 + sqdmulh v4.8h, v1.8h, XFIX_1_847759065 |
| 3435 + sub v2.8h, v15.8h, v13.8h |
| 3436 + add v3.8h, v3.8h, v6.8h |
| 3437 + sqdmulh v6.8h, v2.8h, XFIX_1_414213562 |
| 3438 + add v1.8h, v1.8h, v4.8h |
| 3439 + sqdmulh v4.8h, v5.8h, XFIX_1_082392200 |
| 3440 + sub v10.8h, v10.8h, v14.8h |
| 3441 + add v2.8h, v2.8h, v6.8h |
| 3442 + sub v6.8h, v8.8h, v12.8h |
| 3443 + add v12.8h, v8.8h, v12.8h |
| 3444 + add v9.8h, v5.8h, v4.8h |
| 3445 + add v5.8h, v6.8h, v10.8h |
| 3446 + sub v10.8h, v6.8h, v10.8h |
| 3447 + add v6.8h, v15.8h, v13.8h |
| 3448 + add v8.8h, v12.8h, v14.8h |
| 3449 + sub v3.8h, v6.8h, v3.8h |
| 3450 + sub v12.8h, v12.8h, v14.8h |
| 3451 + sub v3.8h, v3.8h, v1.8h |
| 3452 + sub v1.8h, v9.8h, v1.8h |
| 3453 + add v2.8h, v3.8h, v2.8h |
| 3454 + sub v15.8h, v8.8h, v6.8h |
| 3455 + add v1.8h, v1.8h, v2.8h |
| 3456 + add v8.8h, v8.8h, v6.8h |
| 3457 + add v14.8h, v5.8h, v3.8h |
| 3458 + sub v9.8h, v5.8h, v3.8h |
| 3459 + sub v13.8h, v10.8h, v2.8h |
| 3460 + add v10.8h, v10.8h, v2.8h |
| 3461 + sub v11.8h, v12.8h, v1.8h |
| 3462 + add v12.8h, v12.8h, v1.8h |
| 3463 + /* Descale to 8-bit and range limit */ |
| 3464 + movi v0.16b, #0x80 |
| 3465 + sqshrn v8.8b, v8.8h, #5 |
| 3466 + sqshrn2 v8.16b, v9.8h, #5 |
| 3467 + sqshrn v9.8b, v10.8h, #5 |
| 3468 + sqshrn2 v9.16b, v11.8h, #5 |
| 3469 + sqshrn v10.8b, v12.8h, #5 |
| 3470 + sqshrn2 v10.16b, v13.8h, #5 |
| 3471 + sqshrn v11.8b, v14.8h, #5 |
| 3472 + sqshrn2 v11.16b, v15.8h, #5 |
| 3473 + add v8.16b, v8.16b, v0.16b |
| 3474 + add v9.16b, v9.16b, v0.16b |
| 3475 + add v10.16b, v10.16b, v0.16b |
| 3476 + add v11.16b, v11.16b, v0.16b |
| 3477 + /* Transpose the final 8-bit samples */ |
| 3478 + /* Transpose q8-q9 */ |
| 3479 + mov v18.16b, v8.16b |
| 3480 + trn1 v8.8h, v8.8h, v9.8h |
| 3481 + trn2 v9.8h, v18.8h, v9.8h |
| 3482 + /* Transpose q10-q11 */ |
| 3483 + mov v18.16b, v10.16b |
| 3484 + trn1 v10.8h, v10.8h, v11.8h |
| 3485 + trn2 v11.8h, v18.8h, v11.8h |
| 3486 + /* Transpose q8-q10 */ |
| 3487 + mov v18.16b, v8.16b |
| 3488 + trn1 v8.4s, v8.4s, v10.4s |
| 3489 + trn2 v10.4s, v18.4s, v10.4s |
| 3490 + /* Transpose q9-q11 */ |
| 3491 + mov v18.16b, v9.16b |
| 3492 + trn1 v9.4s, v9.4s, v11.4s |
| 3493 + trn2 v11.4s, v18.4s, v11.4s |
| 3494 + /* make copy */ |
| 3495 + ins v17.2d[0], v8.2d[1] |
| 3496 + /* Transpose d16-d17-msb */ |
| 3497 + mov v18.16b, v8.16b |
| 3498 + trn1 v8.8b, v8.8b, v17.8b |
| 3499 + trn2 v17.8b, v18.8b, v17.8b |
| 3500 + /* make copy */ |
| 3501 + ins v19.2d[0], v9.2d[1] |
| 3502 + mov v18.16b, v9.16b |
| 3503 + trn1 v9.8b, v9.8b, v19.8b |
| 3504 + trn2 v19.8b, v18.8b, v19.8b |
| 3505 + /* Store results to the output buffer */ |
| 3506 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 3507 + add TMP1, TMP1, OUTPUT_COL |
| 3508 + add TMP2, TMP2, OUTPUT_COL |
| 3509 + st1 {v8.8b}, [TMP1] |
| 3510 + st1 {v17.8b}, [TMP2] |
| 3511 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 3512 + add TMP1, TMP1, OUTPUT_COL |
| 3513 + add TMP2, TMP2, OUTPUT_COL |
| 3514 + st1 {v9.8b}, [TMP1] |
| 3515 + /* make copy */ |
| 3516 + ins v7.2d[0], v10.2d[1] |
| 3517 + mov v18.16b, v10.16b |
| 3518 + trn1 v10.8b, v10.8b, v7.8b |
| 3519 + trn2 v7.8b, v18.8b, v7.8b |
| 3520 + st1 {v19.8b}, [TMP2] |
| 3521 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 3522 + ldp TMP4, TMP5, [OUTPUT_BUF], 16 |
| 3523 + add TMP1, TMP1, OUTPUT_COL |
| 3524 + add TMP2, TMP2, OUTPUT_COL |
| 3525 + add TMP4, TMP4, OUTPUT_COL |
| 3526 + add TMP5, TMP5, OUTPUT_COL |
| 3527 + st1 {v10.8b}, [TMP1] |
| 3528 + /* make copy */ |
| 3529 + ins v16.2d[0], v11.2d[1] |
| 3530 + mov v18.16b, v11.16b |
| 3531 + trn1 v11.8b, v11.8b, v16.8b |
| 3532 + trn2 v16.8b, v18.8b, v16.8b |
| 3533 + st1 {v7.8b}, [TMP2] |
| 3534 + st1 {v11.8b}, [TMP4] |
| 3535 + st1 {v16.8b}, [TMP5] |
| 3536 + sub sp, sp, #176 |
| 3537 + ldp x22, x23, [sp], 16 |
| 3538 + ld1 {v0.8b - v3.8b}, [sp], 32 |
| 3539 + ld1 {v4.8b - v7.8b}, [sp], 32 |
| 3540 + ld1 {v8.8b - v11.8b}, [sp], 32 |
| 3541 + ld1 {v12.8b - v15.8b}, [sp], 32 |
| 3542 + ld1 {v16.8b - v19.8b}, [sp], 32 |
| 3543 + blr x30 |
| 3544 + |
| 3545 + .unreq DCT_TABLE |
| 3546 + .unreq COEF_BLOCK |
| 3547 + .unreq OUTPUT_BUF |
| 3548 + .unreq OUTPUT_COL |
| 3549 + .unreq TMP1 |
| 3550 + .unreq TMP2 |
| 3551 + .unreq TMP3 |
| 3552 + .unreq TMP4 |
| 3553 + |
| 3554 + |
| 3555 +/*****************************************************************************/ |
| 3556 + |
| 3557 +/* |
| 3558 + * jsimd_idct_4x4_neon |
| 3559 + * |
| 3560 + * This function contains inverse-DCT code for getting reduced-size |
| 3561 + * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations |
| 3562 + * and produces exactly the same output as IJG's original 'jpeg_idct_4x4' |
| 3563 + * function from jpeg-6b (jidctred.c). |
| 3564 + * |
| 3565 + * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse-DCT, which |
| 3566 + * requires far fewer arithmetic operations and hence should be faster. |
| 3567 + * The primary purpose of this particular NEON-optimized function is |
| 3568 + * bit-exact compatibility with jpeg-6b. |
| 3569 + * |
| 3570 + * TODO: slightly better instruction scheduling could be achieved by expanding |
| 3571 + * the idct_helper/transpose_4x4 macros and reordering instructions, |
| 3572 + * but readability would suffer somewhat. |
| 3573 + */ |
| 3574 + |
| 3575 +#define CONST_BITS 13 |
| 3576 + |
| 3577 +#define FIX_0_211164243 (1730) /* FIX(0.211164243) */ |
| 3578 +#define FIX_0_509795579 (4176) /* FIX(0.509795579) */ |
| 3579 +#define FIX_0_601344887 (4926) /* FIX(0.601344887) */ |
| 3580 +#define FIX_0_720959822 (5906) /* FIX(0.720959822) */ |
| 3581 +#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ |
| 3582 +#define FIX_0_850430095 (6967) /* FIX(0.850430095) */ |
| 3583 +#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ |
| 3584 +#define FIX_1_061594337 (8697) /* FIX(1.061594337) */ |
| 3585 +#define FIX_1_272758580 (10426) /* FIX(1.272758580) */ |
| 3586 +#define FIX_1_451774981 (11893) /* FIX(1.451774981) */ |
| 3587 +#define FIX_1_847759065 (15137) /* FIX(1.847759065) */ |
| 3588 +#define FIX_2_172734803 (17799) /* FIX(2.172734803) */ |
| 3589 +#define FIX_2_562915447 (20995) /* FIX(2.562915447) */ |
| 3590 +#define FIX_3_624509785 (29692) /* FIX(3.624509785) */ |
| 3591 + |
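The FIX_* values above follow libjpeg's usual fixed-point convention: each is the corresponding real constant scaled by 2^CONST_BITS (2^13 here) and rounded to the nearest integer. A minimal self-contained check of that relationship (this snippet is illustrative only and not part of the patch):

    #include <stdio.h>

    #define CONST_BITS  13
    /* Same convention as libjpeg's FIX() macro: scale by 2^CONST_BITS and round. */
    #define FIX(x)  ((int)((x) * (1 << CONST_BITS) + 0.5))

    int main(void)
    {
      /* Spot-check a few of the constants defined above. */
      printf("%d %d %d\n", FIX(0.211164243),   /* 1730  */
                           FIX(1.847759065),   /* 15137 */
                           FIX(3.624509785));  /* 29692 */
      return 0;
    }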
| 3592 +.balign 16 |
| 3593 +jsimd_idct_4x4_neon_consts: |
| 3594 + .short FIX_1_847759065 /* v0.4h[0] */ |
| 3595 + .short -FIX_0_765366865 /* v0.4h[1] */ |
| 3596 + .short -FIX_0_211164243 /* v0.4h[2] */ |
| 3597 + .short FIX_1_451774981 /* v0.4h[3] */ |
| 3598 +    .short -FIX_2_172734803  /* v1.4h[0] */ |
| 3599 +    .short FIX_1_061594337   /* v1.4h[1] */ |
| 3600 +    .short -FIX_0_509795579  /* v1.4h[2] */ |
| 3601 +    .short -FIX_0_601344887  /* v1.4h[3] */ |
| 3602 + .short FIX_0_899976223 /* v2.4h[0] */ |
| 3603 + .short FIX_2_562915447 /* v2.4h[1] */ |
| 3604 + .short 1 << (CONST_BITS+1) /* v2.4h[2] */ |
| 3605 + .short 0 /* v2.4h[3] */ |
| 3606 + |
| 3607 +.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29 |
| 3608 + smull v28.4s, \x4, v2.4h[2] |
| 3609 + smlal v28.4s, \x8, v0.4h[0] |
| 3610 + smlal v28.4s, \x14, v0.4h[1] |
| 3611 + |
| 3612 + smull v26.4s, \x16, v1.4h[2] |
| 3613 + smlal v26.4s, \x12, v1.4h[3] |
| 3614 + smlal v26.4s, \x10, v2.4h[0] |
| 3615 + smlal v26.4s, \x6, v2.4h[1] |
| 3616 + |
| 3617 + smull v30.4s, \x4, v2.4h[2] |
| 3618 + smlsl v30.4s, \x8, v0.4h[0] |
| 3619 + smlsl v30.4s, \x14, v0.4h[1] |
| 3620 + |
| 3621 + smull v24.4s, \x16, v0.4h[2] |
| 3622 + smlal v24.4s, \x12, v0.4h[3] |
| 3623 + smlal v24.4s, \x10, v1.4h[0] |
| 3624 + smlal v24.4s, \x6, v1.4h[1] |
| 3625 + |
| 3626 + add v20.4s, v28.4s, v26.4s |
| 3627 + sub v28.4s, v28.4s, v26.4s |
| 3628 + |
| 3629 +.if \shift > 16 |
| 3630 + srshr v20.4s, v20.4s, #\shift |
| 3631 + srshr v28.4s, v28.4s, #\shift |
| 3632 + xtn \y26, v20.4s |
| 3633 + xtn \y29, v28.4s |
| 3634 +.else |
| 3635 + rshrn \y26, v20.4s, #\shift |
| 3636 + rshrn \y29, v28.4s, #\shift |
| 3637 +.endif |
| 3638 + |
| 3639 + add v20.4s, v30.4s, v24.4s |
| 3640 + sub v30.4s, v30.4s, v24.4s |
| 3641 + |
| 3642 +.if \shift > 16 |
| 3643 + srshr v20.4s, v20.4s, #\shift |
| 3644 + srshr v30.4s, v30.4s, #\shift |
| 3645 + xtn \y27, v20.4s |
| 3646 + xtn \y28, v30.4s |
| 3647 +.else |
| 3648 + rshrn \y27, v20.4s, #\shift |
| 3649 + rshrn \y28, v30.4s, #\shift |
| 3650 +.endif |
| 3651 + |
| 3652 +.endm |
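The `.if \shift > 16` split above is needed because the narrowing rounding shift (rshrn) only accepts shift immediates up to 16 when narrowing 32-bit lanes to 16-bit lanes, so the larger pass-2 descale amounts (19 here, 20 in the 2x2 case further down) are done as a rounding shift (srshr) followed by a plain narrow (xtn). Per lane, both paths compute the same thing; a scalar sketch, with a hypothetical helper name:

    #include <stdint.h>

    /* Round-to-nearest right shift by 'shift', then keep the low 16 bits --
     * the per-lane effect of both the rshrn and the srshr+xtn sequences. */
    static inline int16_t descale_narrow(int32_t x, int shift)
    {
      return (int16_t)((x + (1 << (shift - 1))) >> shift);
    }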
| 3653 + |
| 3654 +asm_function jsimd_idct_4x4_neon |
| 3655 + |
| 3656 + DCT_TABLE .req x0 |
| 3657 + COEF_BLOCK .req x1 |
| 3658 + OUTPUT_BUF .req x2 |
| 3659 + OUTPUT_COL .req x3 |
| 3660 + TMP1 .req x0 |
| 3661 + TMP2 .req x1 |
| 3662 + TMP3 .req x2 |
| 3663 + TMP4 .req x15 |
| 3664 + |
| 3665 + /* Save all used NEON registers */ |
| 3666 + sub sp, sp, 272 |
| 3667 + str x15, [sp], 16 |
| 3668 + /* Load constants (v3.4h is just used for padding) */ |
| 3669 + adr TMP4, jsimd_idct_4x4_neon_consts |
| 3670 + st1 {v0.8b - v3.8b}, [sp], 32 |
| 3671 + st1 {v4.8b - v7.8b}, [sp], 32 |
| 3672 + st1 {v8.8b - v11.8b}, [sp], 32 |
| 3673 + st1 {v12.8b - v15.8b}, [sp], 32 |
| 3674 + st1 {v16.8b - v19.8b}, [sp], 32 |
| 3675 + st1 {v20.8b - v23.8b}, [sp], 32 |
| 3676 + st1 {v24.8b - v27.8b}, [sp], 32 |
| 3677 + st1 {v28.8b - v31.8b}, [sp], 32 |
| 3678 + ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4] |
| 3679 + |
| 3680 + /* Load all COEF_BLOCK into NEON registers with the following allocation: |
| 3681 + * 0 1 2 3 | 4 5 6 7 |
| 3682 + * ---------+-------- |
| 3683 + * 0 | v4.4h | v5.4h |
| 3684 + * 1 | v6.4h | v7.4h |
| 3685 + * 2 | v8.4h | v9.4h |
| 3686 + * 3 | v10.4h | v11.4h |
| 3687 + * 4 | - | - |
| 3688 + * 5 | v12.4h | v13.4h |
| 3689 + * 6 | v14.4h | v15.4h |
| 3690 + * 7 | v16.4h | v17.4h |
| 3691 + */ |
| 3692 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 |
| 3693 + ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32 |
| 3694 +    add             COEF_BLOCK, COEF_BLOCK, #16  /* skip row 4 - unused by the 4x4 reduced IDCT */ |
| 3695 + ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32 |
| 3696 + ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 |
| 3697 + /* dequantize */ |
| 3698 + ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 |
| 3699 + mul v4.4h, v4.4h, v18.4h |
| 3700 + mul v5.4h, v5.4h, v19.4h |
| 3701 + ins v4.2d[1], v5.2d[0] /* 128 bit q4 */ |
| 3702 + ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32 |
| 3703 + mul v6.4h, v6.4h, v20.4h |
| 3704 + mul v7.4h, v7.4h, v21.4h |
| 3705 + ins v6.2d[1], v7.2d[0] /* 128 bit q6 */ |
| 3706 + mul v8.4h, v8.4h, v22.4h |
| 3707 + mul v9.4h, v9.4h, v23.4h |
| 3708 + ins v8.2d[1], v9.2d[0] /* 128 bit q8 */ |
| 3709 + add DCT_TABLE, DCT_TABLE, #16 |
| 3710 + ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32 |
| 3711 + mul v10.4h, v10.4h, v24.4h |
| 3712 + mul v11.4h, v11.4h, v25.4h |
| 3713 + ins v10.2d[1], v11.2d[0] /* 128 bit q10 */ |
| 3714 + mul v12.4h, v12.4h, v26.4h |
| 3715 + mul v13.4h, v13.4h, v27.4h |
| 3716 + ins v12.2d[1], v13.2d[0] /* 128 bit q12 */ |
| 3717 + ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 |
| 3718 + mul v14.4h, v14.4h, v28.4h |
| 3719 + mul v15.4h, v15.4h, v29.4h |
| 3720 + ins v14.2d[1], v15.2d[0] /* 128 bit q14 */ |
| 3721 + mul v16.4h, v16.4h, v30.4h |
| 3722 + mul v17.4h, v17.4h, v31.4h |
| 3723 + ins v16.2d[1], v17.2d[0] /* 128 bit q16 */ |
| 3724 + |
| 3725 + /* Pass 1 */ |
| 3726 +    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h |
| 3727 + transpose_4x4 v4, v6, v8, v10, v3 |
| 3728 + ins v10.2d[1], v11.2d[0] |
| 3729 +    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h |
| 3730 + transpose_4x4 v5, v7, v9, v11, v3 |
| 3731 + ins v10.2d[1], v11.2d[0] |
| 3732 + /* Pass 2 */ |
| 3733 +    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h |
| 3734 + transpose_4x4 v26, v27, v28, v29, v3 |
| 3735 + |
| 3736 + /* Range limit */ |
| 3737 + movi v30.8h, #0x80 |
| 3738 + ins v26.2d[1], v27.2d[0] |
| 3739 + ins v28.2d[1], v29.2d[0] |
| 3740 + add v26.8h, v26.8h, v30.8h |
| 3741 + add v28.8h, v28.8h, v30.8h |
| 3742 + sqxtun v26.8b, v26.8h |
| 3743 + sqxtun v27.8b, v28.8h |
| 3744 + |
| 3745 + /* Store results to the output buffer */ |
| 3746 + ldp TMP1, TMP2, [OUTPUT_BUF], 16 |
| 3747 + ldp TMP3, TMP4, [OUTPUT_BUF] |
| 3748 + add TMP1, TMP1, OUTPUT_COL |
| 3749 + add TMP2, TMP2, OUTPUT_COL |
| 3750 + add TMP3, TMP3, OUTPUT_COL |
| 3751 + add TMP4, TMP4, OUTPUT_COL |
| 3752 + |
| 3753 +#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT |
| 3754 +    /* We can use far fewer instructions on little-endian systems if the |
| 3755 + * OS kernel is not configured to trap unaligned memory accesses |
| 3756 + */ |
| 3757 + st1 {v26.s}[0], [TMP1], 4 |
| 3758 + st1 {v27.s}[0], [TMP3], 4 |
| 3759 + st1 {v26.s}[1], [TMP2], 4 |
| 3760 + st1 {v27.s}[1], [TMP4], 4 |
| 3761 +#else |
| 3762 + st1 {v26.b}[0], [TMP1], 1 |
| 3763 + st1 {v27.b}[0], [TMP3], 1 |
| 3764 + st1 {v26.b}[1], [TMP1], 1 |
| 3765 + st1 {v27.b}[1], [TMP3], 1 |
| 3766 + st1 {v26.b}[2], [TMP1], 1 |
| 3767 + st1 {v27.b}[2], [TMP3], 1 |
| 3768 + st1 {v26.b}[3], [TMP1], 1 |
| 3769 + st1 {v27.b}[3], [TMP3], 1 |
| 3770 + |
| 3771 + st1 {v26.b}[4], [TMP2], 1 |
| 3772 + st1 {v27.b}[4], [TMP4], 1 |
| 3773 + st1 {v26.b}[5], [TMP2], 1 |
| 3774 + st1 {v27.b}[5], [TMP4], 1 |
| 3775 + st1 {v26.b}[6], [TMP2], 1 |
| 3776 + st1 {v27.b}[6], [TMP4], 1 |
| 3777 + st1 {v26.b}[7], [TMP2], 1 |
| 3778 + st1 {v27.b}[7], [TMP4], 1 |
| 3779 +#endif |
| 3780 + |
| 3781 + /* vpop {v8.4h - v15.4h} ;not available */ |
| 3782 + sub sp, sp, #272 |
| 3783 + ldr x15, [sp], 16 |
| 3784 + ld1 {v0.8b - v3.8b}, [sp], 32 |
| 3785 + ld1 {v4.8b - v7.8b}, [sp], 32 |
| 3786 + ld1 {v8.8b - v11.8b}, [sp], 32 |
| 3787 + ld1 {v12.8b - v15.8b}, [sp], 32 |
| 3788 + ld1 {v16.8b - v19.8b}, [sp], 32 |
| 3789 + ld1 {v20.8b - v23.8b}, [sp], 32 |
| 3790 + ld1 {v24.8b - v27.8b}, [sp], 32 |
| 3791 + ld1 {v28.8b - v31.8b}, [sp], 32 |
| 3792 + blr x30 |
| 3793 + |
| 3794 + .unreq DCT_TABLE |
| 3795 + .unreq COEF_BLOCK |
| 3796 + .unreq OUTPUT_BUF |
| 3797 + .unreq OUTPUT_COL |
| 3798 + .unreq TMP1 |
| 3799 + .unreq TMP2 |
| 3800 + .unreq TMP3 |
| 3801 + .unreq TMP4 |
| 3802 + |
| 3803 +.purgem idct_helper |
| 3804 + |
| 3805 + |
| 3806 +/*****************************************************************************/ |
| 3807 + |
| 3808 +/* |
| 3809 + * jsimd_idct_2x2_neon |
| 3810 + * |
| 3811 + * This function contains inverse-DCT code for getting reduced-size |
| 3812 + * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations |
| 3813 + * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' |
| 3814 + * function from jpeg-6b (jidctred.c). |
| 3815 + * |
| 3816 + * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse-DCT, which |
| 3817 + * requires far fewer arithmetic operations and hence should be faster. |
| 3818 + * The primary purpose of this particular NEON-optimized function is |
| 3819 + * bit-exact compatibility with jpeg-6b. |
| 3820 + */ |
| 3821 + |
| 3822 +.balign 8 |
| 3823 +jsimd_idct_2x2_neon_consts: |
| 3824 + .short -FIX_0_720959822 /* v14[0] */ |
| 3825 + .short FIX_0_850430095 /* v14[1] */ |
| 3826 + .short -FIX_1_272758580 /* v14[2] */ |
| 3827 + .short FIX_3_624509785 /* v14[3] */ |
| 3828 + |
| 3829 +.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 |
| 3830 + sshll v15.4s, \x4, #15 |
| 3831 + smull v26.4s, \x6, v14.4h[3] |
| 3832 + smlal v26.4s, \x10, v14.4h[2] |
| 3833 + smlal v26.4s, \x12, v14.4h[1] |
| 3834 + smlal v26.4s, \x16, v14.4h[0] |
| 3835 + |
| 3836 + add v20.4s, v15.4s, v26.4s |
| 3837 + sub v15.4s, v15.4s, v26.4s |
| 3838 + |
| 3839 +.if \shift > 16 |
| 3840 + srshr v20.4s, v20.4s, #\shift |
| 3841 + srshr v15.4s, v15.4s, #\shift |
| 3842 + xtn \y26, v20.4s |
| 3843 + xtn \y27, v15.4s |
| 3844 +.else |
| 3845 + rshrn \y26, v20.4s, #\shift |
| 3846 + rshrn \y27, v15.4s, #\shift |
| 3847 +.endif |
| 3848 + |
| 3849 +.endm |
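For orientation, here is a scalar sketch of what one lane of this 2x2 idct_helper computes, assuming (as the header comment above states) that it mirrors jpeg_idct_2x2 from jidctred.c: the even part is coefficient 0 scaled by 2^15 (the sshll #15), the odd part is the smull/smlal chain over coefficients 1, 3, 5 and 7, and both outputs are descaled with rounding. The helper name and the 64-bit intermediate are illustrative only (the NEON code keeps everything in 32-bit lanes):

    #include <stdint.h>

    #define FIX_0_720959822  (5906)
    #define FIX_0_850430095  (6967)
    #define FIX_1_272758580  (10426)
    #define FIX_3_624509785  (29692)

    static void idct_2x2_col(const int16_t c[8], int shift,
                             int16_t *out0, int16_t *out1)
    {
      int32_t even = (int32_t)c[0] << 15;
      int32_t odd  = c[1] *  FIX_3_624509785
                   + c[3] * -FIX_1_272758580
                   + c[5] *  FIX_0_850430095
                   + c[7] * -FIX_0_720959822;
      int32_t rnd  = 1 << (shift - 1);
      *out0 = (int16_t)(((int64_t)even + odd + rnd) >> shift);
      *out1 = (int16_t)(((int64_t)even - odd + rnd) >> shift);
    }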
| 3850 + |
| 3851 +asm_function jsimd_idct_2x2_neon |
| 3852 + |
| 3853 + DCT_TABLE .req x0 |
| 3854 + COEF_BLOCK .req x1 |
| 3855 + OUTPUT_BUF .req x2 |
| 3856 + OUTPUT_COL .req x3 |
| 3857 + TMP1 .req x0 |
| 3858 + TMP2 .req x15 |
| 3859 + |
| 3860 + /* vpush {v8.4h - v15.4h} ; not available */ |
| 3861 + sub sp, sp, 208 |
| 3862 + str x15, [sp], 16 |
| 3863 + |
| 3864 + /* Load constants */ |
| 3865 + adr TMP2, jsimd_idct_2x2_neon_consts |
| 3866 + st1 {v4.8b - v7.8b}, [sp], 32 |
| 3867 + st1 {v8.8b - v11.8b}, [sp], 32 |
| 3868 + st1 {v12.8b - v15.8b}, [sp], 32 |
| 3869 + st1 {v16.8b - v19.8b}, [sp], 32 |
| 3870 + st1 {v21.8b - v22.8b}, [sp], 16 |
| 3871 + st1 {v24.8b - v27.8b}, [sp], 32 |
| 3872 + st1 {v30.8b - v31.8b}, [sp], 16 |
| 3873 + ld1 {v14.4h}, [TMP2] |
| 3874 + |
| 3875 + /* Load all COEF_BLOCK into NEON registers with the following allocation: |
| 3876 + * 0 1 2 3 | 4 5 6 7 |
| 3877 + * ---------+-------- |
| 3878 + * 0 | v4.4h | v5.4h |
| 3879 + * 1 | v6.4h | v7.4h |
| 3880 + * 2 | - | - |
| 3881 + * 3 | v10.4h | v11.4h |
| 3882 + * 4 | - | - |
| 3883 + * 5 | v12.4h | v13.4h |
| 3884 + * 6 | - | - |
| 3885 + * 7 | v16.4h | v17.4h |
| 3886 + */ |
| 3887 + ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 |
| 3888 + add COEF_BLOCK, COEF_BLOCK, #16 |
| 3889 + ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 |
| 3890 + add COEF_BLOCK, COEF_BLOCK, #16 |
| 3891 + ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 |
| 3892 + add COEF_BLOCK, COEF_BLOCK, #16 |
| 3893 + ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 |
| 3894 + /* Dequantize */ |
| 3895 + ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 |
| 3896 + mul v4.4h, v4.4h, v18.4h |
| 3897 + mul v5.4h, v5.4h, v19.4h |
| 3898 + ins v4.2d[1], v5.2d[0] |
| 3899 + mul v6.4h, v6.4h, v20.4h |
| 3900 + mul v7.4h, v7.4h, v21.4h |
| 3901 + ins v6.2d[1], v7.2d[0] |
| 3902 + add DCT_TABLE, DCT_TABLE, #16 |
| 3903 + ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 |
| 3904 + mul v10.4h, v10.4h, v24.4h |
| 3905 + mul v11.4h, v11.4h, v25.4h |
| 3906 + ins v10.2d[1], v11.2d[0] |
| 3907 + add DCT_TABLE, DCT_TABLE, #16 |
| 3908 + ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 |
| 3909 + mul v12.4h, v12.4h, v26.4h |
| 3910 + mul v13.4h, v13.4h, v27.4h |
| 3911 + ins v12.2d[1], v13.2d[0] |
| 3912 + add DCT_TABLE, DCT_TABLE, #16 |
| 3913 + ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 |
| 3914 + mul v16.4h, v16.4h, v30.4h |
| 3915 + mul v17.4h, v17.4h, v31.4h |
| 3916 + ins v16.2d[1], v17.2d[0] |
| 3917 + |
| 3918 + /* Pass 1 */ |
| 3919 +#if 0 |
| 3920 + idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h |
| 3921 + transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h |
| 3922 + idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h |
| 3923 + transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h |
| 3924 +#else |
| 3925 + smull v26.4s, v6.4h, v14.4h[3] |
| 3926 + smlal v26.4s, v10.4h, v14.4h[2] |
| 3927 + smlal v26.4s, v12.4h, v14.4h[1] |
| 3928 + smlal v26.4s, v16.4h, v14.4h[0] |
| 3929 + smull v24.4s, v7.4h, v14.4h[3] |
| 3930 + smlal v24.4s, v11.4h, v14.4h[2] |
| 3931 + smlal v24.4s, v13.4h, v14.4h[1] |
| 3932 + smlal v24.4s, v17.4h, v14.4h[0] |
| 3933 + sshll v15.4s, v4.4h, #15 |
| 3934 + sshll v30.4s, v5.4h, #15 |
| 3935 + add v20.4s, v15.4s, v26.4s |
| 3936 + sub v15.4s, v15.4s, v26.4s |
| 3937 + rshrn v4.4h, v20.4s, #13 |
| 3938 + rshrn v6.4h, v15.4s, #13 |
| 3939 + add v20.4s, v30.4s, v24.4s |
| 3940 + sub v15.4s, v30.4s, v24.4s |
| 3941 + rshrn v5.4h, v20.4s, #13 |
| 3942 + rshrn v7.4h, v15.4s, #13 |
| 3943 + ins v4.2d[1], v5.2d[0] |
| 3944 + ins v6.2d[1], v7.2d[0] |
| 3945 + transpose v4, v6, v3, .16b, .8h |
| 3946 + transpose v6, v10, v3, .16b, .4s |
| 3947 + ins v11.2d[0], v10.2d[1] |
| 3948 + ins v7.2d[0], v6.2d[1] |
| 3949 +#endif |
| 3950 + |
| 3951 + /* Pass 2 */ |
| 3952 + idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h |
| 3953 + |
| 3954 + /* Range limit */ |
| 3955 + movi v30.8h, #0x80 |
| 3956 + ins v26.2d[1], v27.2d[0] |
| 3957 + add v26.8h, v26.8h, v30.8h |
| 3958 + sqxtun v30.8b, v26.8h |
| 3959 + ins v26.2d[0], v30.2d[0] |
| 3960 + sqxtun v27.8b, v26.8h |
| 3961 + |
| 3962 + /* Store results to the output buffer */ |
| 3963 + ldp TMP1, TMP2, [OUTPUT_BUF] |
| 3964 + add TMP1, TMP1, OUTPUT_COL |
| 3965 + add TMP2, TMP2, OUTPUT_COL |
| 3966 + |
| 3967 + st1 {v26.b}[0], [TMP1], 1 |
| 3968 + st1 {v27.b}[4], [TMP1], 1 |
| 3969 + st1 {v26.b}[1], [TMP2], 1 |
| 3970 + st1 {v27.b}[5], [TMP2], 1 |
| 3971 + |
| 3972 + sub sp, sp, #208 |
| 3973 + ldr x15, [sp], 16 |
| 3974 + ld1 {v4.8b - v7.8b}, [sp], 32 |
| 3975 + ld1 {v8.8b - v11.8b}, [sp], 32 |
| 3976 + ld1 {v12.8b - v15.8b}, [sp], 32 |
| 3977 + ld1 {v16.8b - v19.8b}, [sp], 32 |
| 3978 + ld1 {v21.8b - v22.8b}, [sp], 16 |
| 3979 + ld1 {v24.8b - v27.8b}, [sp], 32 |
| 3980 + ld1 {v30.8b - v31.8b}, [sp], 16 |
| 3981 + blr x30 |
| 3982 + |
| 3983 + .unreq DCT_TABLE |
| 3984 + .unreq COEF_BLOCK |
| 3985 + .unreq OUTPUT_BUF |
| 3986 + .unreq OUTPUT_COL |
| 3987 + .unreq TMP1 |
| 3988 + .unreq TMP2 |
| 3989 + |
| 3990 +.purgem idct_helper |
| 3991 + |
| 3992 + |
| 3993 +/*****************************************************************************/ |
| 3994 + |
| 3995 +/* |
| 3996 + * jsimd_ycc_extrgb_convert_neon |
| 3997 + * jsimd_ycc_extbgr_convert_neon |
| 3998 + * jsimd_ycc_extrgbx_convert_neon |
| 3999 + * jsimd_ycc_extbgrx_convert_neon |
| 4000 + * jsimd_ycc_extxbgr_convert_neon |
| 4001 + * jsimd_ycc_extxrgb_convert_neon |
| 4002 + * |
| 4003 + * Colorspace conversion YCbCr -> RGB |
| 4004 + */ |
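All six entry points take their arguments in the AAPCS64 registers that the generator macro below aliases as OUTPUT_WIDTH (x0), INPUT_BUF (x1), INPUT_ROW (x2), OUTPUT_BUF (x3) and NUM_ROWS (x4). The buffer addressing done in the outer loop is easiest to read as the following scalar model (a sketch only; the helper name is hypothetical, and in libjpeg terms input_buf is a JSAMPIMAGE indexed as [component][row][column] while output_buf is a JSAMPARRAY):

    #include <stddef.h>

    typedef unsigned char sample;

    /* Hypothetical per-row worker standing in for the NEON inner loop. */
    extern void convert_row(const sample *y, const sample *u, const sample *v,
                            sample *rgb, size_t out_width);

    void ycc_rgb_convert_model(size_t out_width, sample ***input_buf,
                               size_t input_row, sample **output_buf,
                               int num_rows)
    {
      while (num_rows-- > 0) {
        const sample *y = input_buf[0][input_row];
        const sample *u = input_buf[1][input_row];
        const sample *v = input_buf[2][input_row];
        input_row++;
        convert_row(y, u, v, *output_buf++, out_width);
      }
    }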
| 4005 + |
| 4006 + |
| 4007 +.macro do_load size |
| 4008 + .if \size == 8 |
| 4009 + ld1 {v4.8b}, [U], 8 |
| 4010 + ld1 {v5.8b}, [V], 8 |
| 4011 + ld1 {v0.8b}, [Y], 8 |
| 4012 + prfm PLDL1KEEP, [U, #64] |
| 4013 + prfm PLDL1KEEP, [V, #64] |
| 4014 + prfm PLDL1KEEP, [Y, #64] |
| 4015 + .elseif \size == 4 |
| 4016 + ld1 {v4.b}[0], [U], 1 |
| 4017 + ld1 {v4.b}[1], [U], 1 |
| 4018 + ld1 {v4.b}[2], [U], 1 |
| 4019 + ld1 {v4.b}[3], [U], 1 |
| 4020 + ld1 {v5.b}[0], [V], 1 |
| 4021 + ld1 {v5.b}[1], [V], 1 |
| 4022 + ld1 {v5.b}[2], [V], 1 |
| 4023 + ld1 {v5.b}[3], [V], 1 |
| 4024 + ld1 {v0.b}[0], [Y], 1 |
| 4025 + ld1 {v0.b}[1], [Y], 1 |
| 4026 + ld1 {v0.b}[2], [Y], 1 |
| 4027 + ld1 {v0.b}[3], [Y], 1 |
| 4028 + .elseif \size == 2 |
| 4029 + ld1 {v4.b}[4], [U], 1 |
| 4030 + ld1 {v4.b}[5], [U], 1 |
| 4031 + ld1 {v5.b}[4], [V], 1 |
| 4032 + ld1 {v5.b}[5], [V], 1 |
| 4033 + ld1 {v0.b}[4], [Y], 1 |
| 4034 + ld1 {v0.b}[5], [Y], 1 |
| 4035 + .elseif \size == 1 |
| 4036 + ld1 {v4.b}[6], [U], 1 |
| 4037 + ld1 {v5.b}[6], [V], 1 |
| 4038 + ld1 {v0.b}[6], [Y], 1 |
| 4039 + .else |
| 4040 + .error unsupported macroblock size |
| 4041 + .endif |
| 4042 +.endm |
| 4043 + |
| 4044 +.macro do_store bpp, size |
| 4045 + .if \bpp == 24 |
| 4046 + .if \size == 8 |
| 4047 + st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24 |
| 4048 + .elseif \size == 4 |
| 4049 + st3 {v10.b, v11.b, v12.b}[0], [RGB], 3 |
| 4050 + st3 {v10.b, v11.b, v12.b}[1], [RGB], 3 |
| 4051 + st3 {v10.b, v11.b, v12.b}[2], [RGB], 3 |
| 4052 + st3 {v10.b, v11.b, v12.b}[3], [RGB], 3 |
| 4053 + .elseif \size == 2 |
| 4054 + st3 {v10.b, v11.b, v12.b}[4], [RGB], 3 |
| 4055 + st3 {v10.b, v11.b, v12.b}[5], [RGB], 3 |
| 4056 + .elseif \size == 1 |
| 4057 + st3 {v10.b, v11.b, v12.b}[6], [RGB], 3 |
| 4058 + .else |
| 4059 + .error unsupported macroblock size |
| 4060 + .endif |
| 4061 + .elseif \bpp == 32 |
| 4062 + .if \size == 8 |
| 4063 + st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32 |
| 4064 + .elseif \size == 4 |
| 4065 + st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4 |
| 4066 + st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4 |
| 4067 + st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4 |
| 4068 + st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4 |
| 4069 + .elseif \size == 2 |
| 4070 + st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4 |
| 4071 + st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4 |
| 4072 + .elseif \size == 1 |
| 4073 + st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4 |
| 4074 + .else |
| 4075 + .error unsupported macroblock size |
| 4076 + .endif |
| 4077 + .elseif \bpp==16 |
| 4078 + .if \size == 8 |
| 4079 + st1 {v25.8h}, [RGB],16 |
| 4080 + .elseif \size == 4 |
| 4081 + st1 {v25.4h}, [RGB],8 |
| 4082 + .elseif \size == 2 |
| 4083 + st1 {v25.h}[4], [RGB],2 |
| 4084 + st1 {v25.h}[5], [RGB],2 |
| 4085 + .elseif \size == 1 |
| 4086 + st1 {v25.h}[6], [RGB],2 |
| 4087 + .else |
| 4088 + .error unsupported macroblock size |
| 4089 + .endif |
| 4090 + .else |
| 4091 + .error unsupported bpp |
| 4092 + .endif |
| 4093 +.endm |
| 4094 + |
| 4095 +.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize |
| 4096 + |
| 4097 +/* |
| 4098 + * 2-stage pipelined YCbCr->RGB conversion |
| 4099 + */ |
| 4100 + |
| 4101 +.macro do_yuv_to_rgb_stage1 |
| 4102 + uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */ |
| 4103 + uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ |
| 4104 + smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ |
| 4105 + smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ |
| 4106 + smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ |
| 4107 + smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ |
| 4108 + smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ |
| 4109 + smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ |
| 4110 + smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ |
| 4111 + smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ |
| 4112 +.endm |
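Stage 1 above computes the chroma contributions with the fixed-point multipliers from the constant table further down (22971, -11277, -23401 and 29033, i.e. roughly the JFIF coefficients 1.402, -0.344, -0.714 and 1.772 scaled by 2^14 or 2^15), and stage 2 rounds, adds Y and saturates to 8 bits. A per-pixel scalar sketch of the same arithmetic (hypothetical helper names, shown only to make the rshrn/uaddw/sqxtun sequence easier to follow):

    #include <stdint.h>

    static inline uint8_t clamp_u8(int32_t x)   /* what sqxtun does per lane */
    {
      return (uint8_t)(x < 0 ? 0 : (x > 255 ? 255 : x));
    }

    static void ycc_to_rgb_pixel(uint8_t y, uint8_t cb, uint8_t cr,
                                 uint8_t *r, uint8_t *g, uint8_t *b)
    {
      int32_t u = cb - 128, v = cr - 128;        /* the uaddw with -128 */
      *r = clamp_u8(y + (( 22971 * v              + (1 << 13)) >> 14));
      *g = clamp_u8(y + ((-11277 * u - 23401 * v  + (1 << 14)) >> 15));
      *b = clamp_u8(y + (( 29033 * u              + (1 << 13)) >> 14));
    }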
| 4113 + |
| 4114 +.macro do_yuv_to_rgb_stage2 |
| 4115 + rshrn v20.4h, v20.4s, #15 |
| 4116 + rshrn2 v20.8h, v22.4s, #15 |
| 4117 + rshrn v24.4h, v24.4s, #14 |
| 4118 + rshrn2 v24.8h, v26.4s, #14 |
| 4119 + rshrn v28.4h, v28.4s, #14 |
| 4120 + rshrn2 v28.8h, v30.4s, #14 |
| 4121 + uaddw v20.8h, v20.8h, v0.8b |
| 4122 + uaddw v24.8h, v24.8h, v0.8b |
| 4123 + uaddw v28.8h, v28.8h, v0.8b |
| 4124 +.if \bpp != 16 |
| 4125 + sqxtun v1\g_offs\defsize, v20.8h |
| 4126 + sqxtun v1\r_offs\defsize, v24.8h |
| 4127 + sqxtun v1\b_offs\defsize, v28.8h |
| 4128 +.else |
| 4129 + sqshlu v21.8h, v20.8h, #8 |
| 4130 + sqshlu v25.8h, v24.8h, #8 |
| 4131 + sqshlu v29.8h, v28.8h, #8 |
| 4132 + sri v25.8h, v21.8h, #5 |
| 4133 + sri v25.8h, v29.8h, #11 |
| 4134 +.endif |
| 4135 + |
| 4136 +.endm |
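For the bpp == 16 path, the sqshlu #8 followed by the two sri inserts assembles a standard RGB565 halfword: R in bits 15:11, G in bits 10:5, B in bits 4:0. For already-clamped 8-bit components that is the usual 5/6/5 truncation; a scalar sketch (hypothetical helper name):

    #include <stdint.h>

    /* Equivalent of sqshlu #8 on each channel, then sri #5 (G) and #11 (B)
     * into the R halfword. */
    static inline uint16_t pack_rgb565(uint8_t r, uint8_t g, uint8_t b)
    {
      return (uint16_t)(((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3));
    }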
| 4137 + |
| 4138 +.macro do_yuv_to_rgb_stage2_store_load_stage1 |
| 4139 + rshrn v20.4h, v20.4s, #15 |
| 4140 + rshrn v24.4h, v24.4s, #14 |
| 4141 + rshrn v28.4h, v28.4s, #14 |
| 4142 + ld1 {v4.8b}, [U], 8 |
| 4143 + rshrn2 v20.8h, v22.4s, #15 |
| 4144 + rshrn2 v24.8h, v26.4s, #14 |
| 4145 + rshrn2 v28.8h, v30.4s, #14 |
| 4146 + ld1 {v5.8b}, [V], 8 |
| 4147 + uaddw v20.8h, v20.8h, v0.8b |
| 4148 + uaddw v24.8h, v24.8h, v0.8b |
| 4149 + uaddw v28.8h, v28.8h, v0.8b |
| 4150 +.if \bpp != 16 /**************** rgb24/rgb32 *********************************/ |
| 4151 + sqxtun v1\g_offs\defsize, v20.8h |
| 4152 + ld1 {v0.8b}, [Y], 8 |
| 4153 + sqxtun v1\r_offs\defsize, v24.8h |
| 4154 + prfm PLDL1KEEP, [U, #64] |
| 4155 + prfm PLDL1KEEP, [V, #64] |
| 4156 + prfm PLDL1KEEP, [Y, #64] |
| 4157 + sqxtun v1\b_offs\defsize, v28.8h |
| 4158 + uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ |
| 4159 + uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ |
| 4160 + smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ |
| 4161 + smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ |
| 4162 + smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ |
| 4163 + smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ |
| 4164 + smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ |
| 4165 + smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ |
| 4166 +.else /**************************** rgb565 ***********************************/ |
| 4167 + sqshlu v21.8h, v20.8h, #8 |
| 4168 + sqshlu v25.8h, v24.8h, #8 |
| 4169 + sqshlu v29.8h, v28.8h, #8 |
| 4170 + uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ |
| 4171 + uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ |
| 4172 + ld1 {v0.8b}, [Y], 8 |
| 4173 + smull v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */ |
| 4174 + smlal v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */ |
| 4175 + smull2 v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */ |
| 4176 + smlal2 v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */ |
| 4177 + sri v25.8h, v21.8h, #5 |
| 4178 + smull v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */ |
| 4179 + smull2 v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */ |
| 4180 + prfm PLDL1KEEP, [U, #64] |
| 4181 + prfm PLDL1KEEP, [V, #64] |
| 4182 + prfm PLDL1KEEP, [Y, #64] |
| 4183 + sri v25.8h, v29.8h, #11 |
| 4184 +.endif |
| 4185 + do_store \bpp, 8 |
| 4186 + smull v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */ |
| 4187 + smull2 v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */ |
| 4188 +.endm |
| 4189 + |
| 4190 +.macro do_yuv_to_rgb |
| 4191 + do_yuv_to_rgb_stage1 |
| 4192 + do_yuv_to_rgb_stage2 |
| 4193 +.endm |
| 4194 + |
| 4195 +/* Apple gas crashes on adrl, work around that by using adr. |
| 4196 + * But this requires a copy of these constants for each function. |
| 4197 + */ |
| 4198 + |
| 4199 +.balign 16 |
| 4200 +jsimd_ycc_\colorid\()_neon_consts: |
| 4201 + .short 0, 0, 0, 0 |
| 4202 + .short 22971, -11277, -23401, 29033 |
| 4203 + .short -128, -128, -128, -128 |
| 4204 + .short -128, -128, -128, -128 |
| 4205 + |
| 4206 +asm_function jsimd_ycc_\colorid\()_convert_neon |
| 4207 + OUTPUT_WIDTH .req x0 |
| 4208 + INPUT_BUF .req x1 |
| 4209 + INPUT_ROW .req x2 |
| 4210 + OUTPUT_BUF .req x3 |
| 4211 + NUM_ROWS .req x4 |
| 4212 + |
| 4213 + INPUT_BUF0 .req x5 |
| 4214 + INPUT_BUF1 .req x6 |
| 4215 + INPUT_BUF2 .req INPUT_BUF |
| 4216 + |
| 4217 + RGB .req x7 |
| 4218 + Y .req x8 |
| 4219 + U .req x9 |
| 4220 + V .req x10 |
| 4221 + N .req x15 |
| 4222 + |
| 4223 + sub sp, sp, 336 |
| 4224 + str x15, [sp], 16 |
| 4225 +    /* Load constants into v1.4h and v2.8h (v0.4h is just used for padding) */ |
| 4226 + adr x15, jsimd_ycc_\colorid\()_neon_consts |
| 4227 + /* Save NEON registers */ |
| 4228 + st1 {v0.8b - v3.8b}, [sp], 32 |
| 4229 + st1 {v4.8b - v7.8b}, [sp], 32 |
| 4230 + st1 {v8.8b - v11.8b}, [sp], 32 |
| 4231 + st1 {v12.8b - v15.8b}, [sp], 32 |
| 4232 + st1 {v16.8b - v19.8b}, [sp], 32 |
| 4233 + st1 {v20.8b - v23.8b}, [sp], 32 |
| 4234 + st1 {v24.8b - v27.8b}, [sp], 32 |
| 4235 + st1 {v28.8b - v31.8b}, [sp], 32 |
| 4236 + ld1 {v0.4h, v1.4h}, [x15], 16 |
| 4237 + ld1 {v2.8h}, [x15] |
| 4238 + |
| 4239 + /* Save ARM registers and handle input arguments */ |
| 4240 + /* push {x4, x5, x6, x7, x8, x9, x10, x30} */ |
| 4241 + stp x4, x5, [sp], 16 |
| 4242 + stp x6, x7, [sp], 16 |
| 4243 + stp x8, x9, [sp], 16 |
| 4244 + stp x10, x30, [sp], 16 |
| 4245 + ldr INPUT_BUF0, [INPUT_BUF] |
| 4246 + ldr INPUT_BUF1, [INPUT_BUF, 8] |
| 4247 + ldr INPUT_BUF2, [INPUT_BUF, 16] |
| 4248 + .unreq INPUT_BUF |
| 4249 + |
| 4250 +    /* Initially set v10.16b and v13.16b to 0xFF (the filler/alpha byte for the 32-bit formats) */ |
| 4251 + movi v10.16b, #255 |
| 4252 + movi v13.16b, #255 |
| 4253 + |
| 4254 + /* Outer loop over scanlines */ |
| 4255 + cmp NUM_ROWS, #1 |
| 4256 + blt 9f |
| 4257 +0: |
| 4258 + lsl x16, INPUT_ROW, #3 |
| 4259 + ldr Y, [INPUT_BUF0, x16] |
| 4260 + ldr U, [INPUT_BUF1, x16] |
| 4261 + mov N, OUTPUT_WIDTH |
| 4262 + ldr V, [INPUT_BUF2, x16] |
| 4263 + add INPUT_ROW, INPUT_ROW, #1 |
| 4264 + ldr RGB, [OUTPUT_BUF], #8 |
| 4265 + |
| 4266 + /* Inner loop over pixels */ |
| 4267 + subs N, N, #8 |
| 4268 + blt 3f |
| 4269 + do_load 8 |
| 4270 + do_yuv_to_rgb_stage1 |
| 4271 + subs N, N, #8 |
| 4272 + blt 2f |
| 4273 +1: |
| 4274 + do_yuv_to_rgb_stage2_store_load_stage1 |
| 4275 + subs N, N, #8 |
| 4276 + bge 1b |
| 4277 +2: |
| 4278 + do_yuv_to_rgb_stage2 |
| 4279 + do_store \bpp, 8 |
| 4280 + tst N, #7 |
| 4281 + beq 8f |
| 4282 +3: |
| 4283 + tst N, #4 |
| 4284 + beq 3f |
| 4285 + do_load 4 |
| 4286 +3: |
| 4287 + tst N, #2 |
| 4288 + beq 4f |
| 4289 + do_load 2 |
| 4290 +4: |
| 4291 + tst N, #1 |
| 4292 + beq 5f |
| 4293 + do_load 1 |
| 4294 +5: |
| 4295 + do_yuv_to_rgb |
| 4296 + tst N, #4 |
| 4297 + beq 6f |
| 4298 + do_store \bpp, 4 |
| 4299 +6: |
| 4300 + tst N, #2 |
| 4301 + beq 7f |
| 4302 + do_store \bpp, 2 |
| 4303 +7: |
| 4304 + tst N, #1 |
| 4305 + beq 8f |
| 4306 + do_store \bpp, 1 |
| 4307 +8: |
| 4308 + subs NUM_ROWS, NUM_ROWS, #1 |
| 4309 + bgt 0b |
| 4310 +9: |
| 4311 + /* Restore all registers and return */ |
| 4312 + sub sp, sp, #336 |
| 4313 + ldr x15, [sp], 16 |
| 4314 + ld1 {v0.8b - v3.8b}, [sp], 32 |
| 4315 + ld1 {v4.8b - v7.8b}, [sp], 32 |
| 4316 + ld1 {v8.8b - v11.8b}, [sp], 32 |
| 4317 + ld1 {v12.8b - v15.8b}, [sp], 32 |
| 4318 + ld1 {v16.8b - v19.8b}, [sp], 32 |
| 4319 + ld1 {v20.8b - v23.8b}, [sp], 32 |
| 4320 + ld1 {v24.8b - v27.8b}, [sp], 32 |
| 4321 + ld1 {v28.8b - v31.8b}, [sp], 32 |
| 4322 + /* pop {r4, r5, r6, r7, r8, r9, r10, pc} */ |
| 4323 + ldp x4, x5, [sp], 16 |
| 4324 + ldp x6, x7, [sp], 16 |
| 4325 + ldp x8, x9, [sp], 16 |
| 4326 + ldp x10, x30, [sp], 16 |
| 4327 + br x30 |
| 4328 + .unreq OUTPUT_WIDTH |
| 4329 + .unreq INPUT_ROW |
| 4330 + .unreq OUTPUT_BUF |
| 4331 + .unreq NUM_ROWS |
| 4332 + .unreq INPUT_BUF0 |
| 4333 + .unreq INPUT_BUF1 |
| 4334 + .unreq INPUT_BUF2 |
| 4335 + .unreq RGB |
| 4336 + .unreq Y |
| 4337 + .unreq U |
| 4338 + .unreq V |
| 4339 + .unreq N |
| 4340 + |
| 4341 +.purgem do_yuv_to_rgb |
| 4342 +.purgem do_yuv_to_rgb_stage1 |
| 4343 +.purgem do_yuv_to_rgb_stage2 |
| 4344 +.purgem do_yuv_to_rgb_stage2_store_load_stage1 |
| 4345 +.endm |
| 4346 + |
| 4347 +/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize */ |
| 4348 +generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b |
| 4349 +generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b |
| 4350 +generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b |
| 4351 +generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b |
| 4352 +generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b |
| 4353 +generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b |
| 4354 +generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b |
| 4355 +.purgem do_load |
| 4356 +.purgem do_store |
OLD | NEW |