Index: simd/jsimd_arm_neon.S
===================================================================
--- simd/jsimd_arm_neon.S (revision 134206)
+++ simd/jsimd_arm_neon.S (working copy)
@@ -2157,3 +2157,241 @@
     .unreq          SHIFT
     .unreq          LOOP_COUNT
 .endfunc
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(void)
+ * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
+ *                                 JDIMENSION downsampled_width,
+ *                                 JSAMPARRAY input_data,
+ *                                 JSAMPARRAY * output_data_ptr);
+ *
+ * Note: unaligned writes are the main remaining bottleneck in this code.
+ * Eliminating them could potentially improve performance by up to tens of
+ * percent on Cortex-A8/Cortex-A9.
+ */
+
+/*
+ * Upsample 16 source pixels to 32 destination pixels. The new 16 source
+ * pixels are loaded into q0. The previous 16 source pixels are in q1. The
+ * shifted-by-one source pixels are constructed in q2 from q0 and q1.
+ * Register d28 is used for multiplication by 3, and register q15 is used
+ * for adding the +1 bias.
+ */
+.macro upsample16 OUTPTR, INPTR
+    vld1.8          {q0}, [\INPTR]!
+    vmovl.u8        q8, d0
+    vext.8          q2, q1, q0, #15
+    vmovl.u8        q9, d1
+    vaddw.u8        q10, q15, d4
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d0, d28
+    vmlal.u8        q11, d1, d28
+    vmov            q1, q0      /* backup source pixels to q1 */
+    vrshrn.u16      d6, q8, #2
+    vrshrn.u16      d7, q9, #2
+    vshrn.u16       d8, q10, #2
+    vshrn.u16       d9, q11, #2
+    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
+.endm
+
+/*
+ * Upsample 32 source pixels to 64 destination pixels. Compared to the
+ * 'upsample16' macro, the roles of the q0 and q1 registers alternate between
+ * the even and odd groups of 16 pixels, so the "vmov q1, q0" instruction is
+ * not needed. This unrolling also allows loads and stores to be reordered,
+ * hiding the multiplication latency and reducing stalls.
+ */
+.macro upsample32 OUTPTR, INPTR
+    /* even 16 pixels group */
+    vld1.8          {q0}, [\INPTR]!
+    vmovl.u8        q8, d0
+    vext.8          q2, q1, q0, #15
+    vmovl.u8        q9, d1
+    vaddw.u8        q10, q15, d4
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d0, d28
+    vmlal.u8        q11, d1, d28
+    /* odd 16 pixels group */
+    vld1.8          {q1}, [\INPTR]!
+    vrshrn.u16      d6, q8, #2
+    vrshrn.u16      d7, q9, #2
+    vshrn.u16       d8, q10, #2
+    vshrn.u16       d9, q11, #2
+    vmovl.u8        q8, d2
+    vext.8          q2, q0, q1, #15
+    vmovl.u8        q9, d3
+    vaddw.u8        q10, q15, d4
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d2, d28
+    vmlal.u8        q11, d3, d28
+    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
+    vrshrn.u16      d6, q8, #2
+    vrshrn.u16      d7, q9, #2
+    vshrn.u16       d8, q10, #2
+    vshrn.u16       d9, q11, #2
+    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
+.endm
+
+/*
+ * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
+ */
+.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
+    /* special case for the first and last pixels */
+    sub             \WIDTH, \WIDTH, #1
+    add             \OUTPTR, \OUTPTR, #1
+    ldrb            \TMP1, [\INPTR, \WIDTH]
+    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
+    ldrb            \TMP1, [\INPTR], #1
+    strb            \TMP1, [\OUTPTR, #-1]
+    vmov.8          d3[7], \TMP1
+
+    subs            \WIDTH, \WIDTH, #32
+    blt             5f
+0:  /* process 32 pixels per iteration */
+    upsample32      \OUTPTR, \INPTR
+    subs            \WIDTH, \WIDTH, #32
+    bge             0b
+5:
+    adds            \WIDTH, \WIDTH, #16
+    blt             1f
+0:  /* process 16 pixels if needed */
+    upsample16      \OUTPTR, \INPTR
+    subs            \WIDTH, \WIDTH, #16
+1:
+    adds            \WIDTH, \WIDTH, #16
+    beq             9f
+
+    /* load the remaining 1-15 pixels */
+    add             \INPTR, \INPTR, \WIDTH
+    tst             \WIDTH, #1
+    beq             2f
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[0]}, [\INPTR]
+2:
+    tst             \WIDTH, #2
+    beq             2f
+    vext.8          d0, d0, d0, #6
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[1]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[0]}, [\INPTR]
+2:
+    tst             \WIDTH, #4
+    beq             2f
+    vrev64.32       d0, d0
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[3]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[2]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[1]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[0]}, [\INPTR]
+2:
+    tst             \WIDTH, #8
+    beq             2f
+    vmov            d1, d0
+    sub             \INPTR, \INPTR, #8
+    vld1.8          {d0}, [\INPTR]
+2:  /* upsample the remaining pixels */
+    vmovl.u8        q8, d0
+    vext.8          q2, q1, q0, #15
+    vmovl.u8        q9, d1
+    vaddw.u8        q10, q15, d4
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8, d4, d28
+    vmlal.u8        q9, d5, d28
+    vmlal.u8        q10, d0, d28
+    vmlal.u8        q11, d1, d28
+    vrshrn.u16      d10, q8, #2
+    vrshrn.u16      d12, q9, #2
+    vshrn.u16       d11, q10, #2
+    vshrn.u16       d13, q11, #2
+    vzip.8          d10, d11
+    vzip.8          d12, d13
+    /* store the remaining pixels */
+    tst             \WIDTH, #8
+    beq             2f
+    vst1.8          {d10, d11}, [\OUTPTR]!
+    vmov            q5, q6
+2:
+    tst             \WIDTH, #4
+    beq             2f
+    vst1.8          {d10}, [\OUTPTR]!
+    vmov            d10, d11
+2:
+    tst             \WIDTH, #2
+    beq             2f
+    vst1.8          {d10[0]}, [\OUTPTR]!
+    vst1.8          {d10[1]}, [\OUTPTR]!
+    vst1.8          {d10[2]}, [\OUTPTR]!
+    vst1.8          {d10[3]}, [\OUTPTR]!
+    vext.8          d10, d10, d10, #4
+2:
+    tst             \WIDTH, #1
+    beq             2f
+    vst1.8          {d10[0]}, [\OUTPTR]!
+    vst1.8          {d10[1]}, [\OUTPTR]!
+2:
+9:
+.endm
+
+asm_function jsimd_h2v1_fancy_upsample_neon
+
+    MAX_V_SAMP_FACTOR .req r0
+    DOWNSAMPLED_WIDTH .req r1
+    INPUT_DATA        .req r2
+    OUTPUT_DATA_PTR   .req r3
+    OUTPUT_DATA       .req OUTPUT_DATA_PTR
+
+    OUTPTR            .req r4
+    INPTR             .req r5
+    WIDTH             .req ip
+    TMP               .req lr
+
+    push            {r4, r5, r6, lr}
+    vpush           {d8-d15}
+
+    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
+    cmp             MAX_V_SAMP_FACTOR, #0
+    ble             99f
+
+    /* initialize constants */
+    vmov.u8         d28, #3
+    vmov.u16        q15, #1
+11:
+    ldr             INPTR, [INPUT_DATA], #4
+    ldr             OUTPTR, [OUTPUT_DATA], #4
+    mov             WIDTH, DOWNSAMPLED_WIDTH
+    upsample_row    OUTPTR, INPTR, WIDTH, TMP
+    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
+    bgt             11b
+
+99:
+    vpop            {d8-d15}
+    pop             {r4, r5, r6, pc}
+
+    .unreq          MAX_V_SAMP_FACTOR
+    .unreq          DOWNSAMPLED_WIDTH
+    .unreq          INPUT_DATA
+    .unreq          OUTPUT_DATA_PTR
+    .unreq          OUTPUT_DATA
+
+    .unreq          OUTPTR
+    .unreq          INPTR
+    .unreq          WIDTH
+    .unreq          TMP
+
+.endfunc
+
+.purgem upsample16
+.purgem upsample32
+.purgem upsample_row
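
For reference, the arithmetic that the macros above vectorize is libjpeg's
"fancy" h2v1 upsampling: each output pixel is 3/4 of the nearer source pixel
plus 1/4 of the further one, with the very first and last output pixels simply
copied. Below is a rough scalar sketch of what one row computes; the function
name and the plain unsigned char interface are illustrative simplifications,
not the actual libjpeg prototypes.

/* Scalar sketch of fancy h2v1 upsampling for a single row (illustrative). */
static void
h2v1_fancy_upsample_row (const unsigned char *in, unsigned char *out,
                         unsigned int downsampled_width)
{
  unsigned int i;

  if (downsampled_width == 0)
    return;
  if (downsampled_width == 1) {
    out[0] = out[1] = in[0];
    return;
  }
  /* first column: left output copied, right output uses the +2 bias */
  out[0] = in[0];
  out[1] = (unsigned char) ((in[0] * 3 + in[1] + 2) >> 2);
  for (i = 1; i < downsampled_width - 1; i++) {
    /* 3/4 * nearer pixel + 1/4 * further pixel, with +1 / +2 biases */
    out[2 * i]     = (unsigned char) ((in[i] * 3 + in[i - 1] + 1) >> 2);
    out[2 * i + 1] = (unsigned char) ((in[i] * 3 + in[i + 1] + 2) >> 2);
  }
  /* last column: left output uses the +1 bias, right output copied */
  out[2 * i]     = (unsigned char) ((in[i] * 3 + in[i - 1] + 1) >> 2);
  out[2 * i + 1] = in[i];
}

In the NEON code above, the +1 bias is applied explicitly (q15 added with
vaddw.u8, followed by a truncating vshrn.u16 #2), while the +2 bias falls out
of the rounding narrowing shift vrshrn.u16 #2.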