source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon_asm.asm - Issue 812033011: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon_asm.asm

Issue 812033011: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« no previous file with comments | « source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c ('k') | source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 ;

	2 ; Copyright (c) 2013 The WebM project authors. All Rights Reserved.

	3 ;

	4 ; Use of this source code is governed by a BSD-style license and patent

	5 ; grant that can be found in the LICENSE file in the root of the source

	6 ; tree. All contributing project authors may be found in the AUTHORS

	7 ; file in the root of the source tree.

	8 ;

	9

	; Assembler setup: export the entry point, select ARM instruction
	; encoding, declare the 8-byte stack alignment build attributes, and open
	; a read-only code area aligned to 2^2 = 4 bytes.

	10 EXPORT \|vp9_idct32x32_1_add_neon\|

	11 ARM

	12 REQUIRE8

	13 PRESERVE8

	14

	15 AREA \|\|.text\|\|, CODE, READONLY, ALIGN=2

	16

	17 ;TODO(hkuang): put the following macros in a separate

	18 ;file so other idct functions could also use them.

	; LD_16x8 $src, $stride
	; Loads eight 16-byte vectors into q8-q15, in that order. Every vld1.8
	; is post-indexed, so $src is advanced by $stride after each load and is
	; left pointing 8 * $stride bytes past its starting value.
	; Clobbers: q8-q15, $src.

	19 MACRO

	20 LD_16x8 $src, $stride

	21 vld1.8 {q8}, [$src], $stride

	22 vld1.8 {q9}, [$src], $stride

	23 vld1.8 {q10}, [$src], $stride

	24 vld1.8 {q11}, [$src], $stride

	25 vld1.8 {q12}, [$src], $stride

	26 vld1.8 {q13}, [$src], $stride

	27 vld1.8 {q14}, [$src], $stride

	28 vld1.8 {q15}, [$src], $stride

	29 MEND

	30

	; ADD_DIFF_16x8 $diff
	; Adds $diff to each of q8-q15 in place, lane by lane on unsigned bytes.
	; vqadd.u8 saturates, so a lane that would exceed 255 is clamped to 255.
	; $diff is only read. Clobbers: q8-q15.

	31 MACRO

	32 ADD_DIFF_16x8 $diff

	33 vqadd.u8 q8, q8, $diff

	34 vqadd.u8 q9, q9, $diff

	35 vqadd.u8 q10, q10, $diff

	36 vqadd.u8 q11, q11, $diff

	37 vqadd.u8 q12, q12, $diff

	38 vqadd.u8 q13, q13, $diff

	39 vqadd.u8 q14, q14, $diff

	40 vqadd.u8 q15, q15, $diff

	41 MEND

	42

	; SUB_DIFF_16x8 $diff
	; Subtracts $diff from each of q8-q15 in place, lane by lane on unsigned
	; bytes. vqsub.u8 saturates, so a lane that would go below 0 is clamped
	; to 0. $diff is only read. Clobbers: q8-q15.

	43 MACRO

	44 SUB_DIFF_16x8 $diff

	45 vqsub.u8 q8, q8, $diff

	46 vqsub.u8 q9, q9, $diff

	47 vqsub.u8 q10, q10, $diff

	48 vqsub.u8 q11, q11, $diff

	49 vqsub.u8 q12, q12, $diff

	50 vqsub.u8 q13, q13, $diff

	51 vqsub.u8 q14, q14, $diff

	52 vqsub.u8 q15, q15, $diff

	53 MEND

	54

	; ST_16x8 $dst, $stride
	; Stores q8-q15, in that order, as eight 16-byte vectors. Every vst1.8 is
	; post-indexed, so $dst is advanced by $stride after each store and is
	; left pointing 8 * $stride bytes past its starting value.
	; Clobbers: $dst. q8-q15 are only read.

	55 MACRO

	56 ST_16x8 $dst, $stride

	57 vst1.8 {q8}, [$dst], $stride

	58 vst1.8 {q9}, [$dst], $stride

	59 vst1.8 {q10},[$dst], $stride

	60 vst1.8 {q11},[$dst], $stride

	61 vst1.8 {q12},[$dst], $stride

	62 vst1.8 {q13},[$dst], $stride

	63 vst1.8 {q14},[$dst], $stride

	64 vst1.8 {q15},[$dst], $stride

	65 MEND

	66

	67 ;void vp9_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,

	68 ; int dest_stride)

	69 ;

	70 ; r0 int16_t *input

	71 ; r1 uint8_t *dest

	72 ; r2 int dest_stride

	; Only input[0] is read. It is scaled twice by cospi_16_64 with rounding,
	; then rounded down by 6 bits to give a1. The magnitude of a1, clamped to
	; 0..255, is added to (a1 >= 0) or subtracted from (a1 < 0) every byte of
	; the 32x32 block at dest, using unsigned saturating byte arithmetic.
	;
	; The block is handled as two 16-column halves of 32 rows each:
	;   r1  = load pointer        r12 = store pointer
	;   r3  = dest + 16 (start of the second half)
	;   r0  = a1, then reused as the pass counter
	; Clobbers: r0-r3, r12, q0, q8-q15, flags.

	73

	74 \|vp9_idct32x32_1_add_neon\| PROC

	75 push {lr}

	76 pld [r1]

	77 add r3, r1, #16 ; r3 dest + 16 for second loop

	78 ldrsh r0, [r0] ; r0 = input[0], sign-extended

	79

	80 ; generate cospi_16_64 = 11585

	81 mov r12, #0x2d00

	82 add r12, #0x41

	83

	84 ; out = dct_const_round_shift(input[0] * cospi_16_64)

	85 mul r0, r0, r12 ; input[0] * cospi_16_64

	86 add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))

	87 asr r0, r0, #14 ; >> DCT_CONST_BITS

	88

	89 ; out = dct_const_round_shift(out * cospi_16_64)

	90 mul r0, r0, r12 ; out * cospi_16_64

	91 mov r12, r1 ; save dest

	92 add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))

	93 asr r0, r0, #14 ; >> DCT_CONST_BITS

	94

	95 ; a1 = ROUND_POWER_OF_TWO(out, 6)

	96 add r0, r0, #32 ; + (1 <<((6) - 1))

	97 asrs r0, r0, #6 ; >> 6, N/Z now reflect a1

	; Branch on the sign bit alone. asrs leaves V untouched, so a signed
	; condition such as bge (N == V) would depend on whatever V held on
	; entry and could pick the wrong path.
	98 bpl diff_positive_32_32

	99

	100 diff_negative_32_32

	101 neg r0, r0 ; r0 = -a1 (positive magnitude)

	102 usat r0, #8, r0 ; clamp magnitude to 0..255

	103 vdup.u8 q0, r0 ; broadcast to all 16 byte lanes

	104 mov r0, #4 ; 4 passes of 16 rows: 2 per 16-column half

	105

	106 diff_negative_32_32_loop

	107 sub r0, #1

	108 LD_16x8 r1, r2

	109 SUB_DIFF_16x8 q0

	110 ST_16x8 r12, r2

	111

	112 LD_16x8 r1, r2

	113 SUB_DIFF_16x8 q0

	114 ST_16x8 r12, r2

	115 cmp r0, #2 ; two passes done: first half finished

	116 moveq r1, r3 ; restart loads at dest + 16

	117 moveq r12, r3 ; restart stores at dest + 16

	118 cmp r0, #0

	119 bne diff_negative_32_32_loop

	120 pop {pc}

	121

	122 diff_positive_32_32

	123 usat r0, #8, r0 ; clamp a1 to 0..255

	124 vdup.u8 q0, r0 ; broadcast to all 16 byte lanes

	125 mov r0, #4 ; 4 passes of 16 rows: 2 per 16-column half

	126

	127 diff_positive_32_32_loop

	128 sub r0, #1

	129 LD_16x8 r1, r2

	130 ADD_DIFF_16x8 q0

	131 ST_16x8 r12, r2

	132

	133 LD_16x8 r1, r2

	134 ADD_DIFF_16x8 q0

	135 ST_16x8 r12, r2

	136 cmp r0, #2 ; two passes done: first half finished

	137 moveq r1, r3 ; restart loads at dest + 16

	138 moveq r12, r3 ; restart stores at dest + 16

	139 cmp r0, #0

	140 bne diff_positive_32_32_loop

	141 pop {pc}

	142

	143 ENDP ; \|vp9_idct32x32_1_add_neon\|

	144 END

OLD	NEW