/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

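// Q16 fixed-point constants for the VP8 inverse DCT:
// sqrt(2) * cos(pi / 8) - 1 ~= 20091 / 65536
// sqrt(2) * sin(pi / 8)     ~= 35468 / 65536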
static const int16_t cospi8sqrt2minus1 = 20091;
// 35468 does not fit in int16_t; on two's-complement targets it wraps
// to -30068. The >> 1 and add of the input applied after each vqdmulh
// below undoes the wrap, so the product still equals (x * 35468) >> 16.
static const int16_t sinpi8sqrt2 = (int16_t)35468;

void vp8_dequant_idct_add_neon(
        int16_t *input,
        int16_t *dq,
        unsigned char *dst,
        int stride) {
    unsigned char *dst0;
    int32x2_t d14, d15;
    int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
    int16x8_t q1, q2, q3, q4, q5, q6;
    int16x8_t qEmpty = vdupq_n_s16(0);
    int32x2x2_t d2tmp0, d2tmp1;
    int16x4x2_t d2tmp2, d2tmp3;

    d14 = d15 = vdup_n_s32(0);

    // load input
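    // (storing qEmpty clears the coefficient block behind us, the same
    // job the memset does in the scalar vp8_dequant_idct_add_c)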
    q3 = vld1q_s16(input);
    vst1q_s16(input, qEmpty);
    input += 8;
    q4 = vld1q_s16(input);
    vst1q_s16(input, qEmpty);

    // load dq
    q5 = vld1q_s16(dq);
    dq += 8;
    q6 = vld1q_s16(dq);

    // load src from dst
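    // (each 32-bit lane carries one 4-byte row of the 4x4 prediction)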
    dst0 = dst;
    d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);
    dst0 += stride;
    d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
    dst0 += stride;
    d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
    dst0 += stride;
    d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);

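    // dequantize: coefficient * dequant factor, keeping only the low
    // 16 bits of each product (the same truncation the scalar code
    // gets from assigning the product back to a short)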
    q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3),
                                         vreinterpretq_u16_s16(q5)));
    q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4),
                                         vreinterpretq_u16_s16(q6)));

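    // first pass (columns): q1 holds rows 0-1 and q2 rows 2-3 of the
    // 4x4 block, so the even part is a row0/row2 butterfly done for
    // all four columns at once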
    d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));
    d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));

    q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));

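    // odd part: rows 1 and 3 (packed in q2) are scaled by the two
    // constants; vqdmulh returns the high half of 2 * x * c, i.e.
    // (x * c) >> 15, saturated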
    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);

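    // the >> 1 plus add of q2 serves both constants: it restores the
    // integer part dropped from cospi8sqrt2minus1 and cancels the
    // int16 wrap of sinpi8sqrt2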
    q3 = vshrq_n_s16(q3, 1);
    q4 = vshrq_n_s16(q4, 1);

    q3 = vqaddq_s16(q3, q2);
    q4 = vqaddq_s16(q4, q2);

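    // odd outputs for all four columns:
    // d10 = s*row1 - r*row3, d11 = r*row1 + s*row3,
    // with s = sqrt(2)sin(pi/8) and r = sqrt(2)cos(pi/8)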
    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
    d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));

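    // recombine even and odd parts into the four rows of the first pass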
    d2 = vqadd_s16(d12, d11);
    d3 = vqadd_s16(d13, d10);
    d4 = vqsub_s16(d13, d10);
    d5 = vqsub_s16(d12, d11);

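    // 4x4 transpose so the second pass can work on rows: vtrn_s32
    // swaps the 2x2 sub-blocks, vtrn_s16 the elements within them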
    d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
    d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
    d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
                      vreinterpret_s16_s32(d2tmp1.val[0]));
    d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
                      vreinterpret_s16_s32(d2tmp1.val[1]));

    // second pass (rows): the same butterfly on the transposed block;
    // q2 packs columns 1 and 3 for the odd-part multiplies
    q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);

    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);

    d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
    d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);

    q3 = vshrq_n_s16(q3, 1);
    q4 = vshrq_n_s16(q4, 1);

    q3 = vqaddq_s16(q3, q2);
    q4 = vqaddq_s16(q4, q2);

    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
    d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));

    d2 = vqadd_s16(d12, d11);
    d3 = vqadd_s16(d13, d10);
    d4 = vqsub_s16(d13, d10);
    d5 = vqsub_s16(d12, d11);

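    // final rounding shift of the transform: (x + 4) >> 3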
    d2 = vrshr_n_s16(d2, 3);
    d3 = vrshr_n_s16(d3, 3);
    d4 = vrshr_n_s16(d4, 3);
    d5 = vrshr_n_s16(d5, 3);

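    // transpose back to row order before adding the prediction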
    d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
    d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
    d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
                      vreinterpret_s16_s32(d2tmp1.val[0]));
    d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
                      vreinterpret_s16_s32(d2tmp1.val[1]));

    q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
    q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);

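    // widen the 8-bit prediction and add the 16-bit residual to it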
    q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1),
                                        vreinterpret_u8_s32(d14)));
    q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2),
                                        vreinterpret_u8_s32(d15)));

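    // saturate the sums back to the unsigned 8-bit pixel range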
    d14 = vreinterpret_s32_u8(vqmovun_s16(q1));
    d15 = vreinterpret_s32_u8(vqmovun_s16(q2));

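    // store the reconstructed block, one 32-bit lane (4 pixels) per row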
    dst0 = dst;
    vst1_lane_s32((int32_t *)dst0, d14, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst0, d14, 1);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst0, d15, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst0, d15, 1);
}