source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c - Issue 478033002: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c

Issue 478033002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 /*

	2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.

	3 *

	4 * Use of this source code is governed by a BSD-style license

	5 * that can be found in the LICENSE file in the root of the source

	6 * tree. An additional intellectual property rights grant can be found

	7 * in the file PATENTS. All contributing project authors may

	8 * be found in the AUTHORS file in the root of the source tree.

	9 */

	10

	11 #include <arm_neon.h>

	12 #include "./vp9_rtcd.h"

	13 #include "./vpx_config.h"

	14

	15 #include "vp9/common/vp9_blockd.h"

	16 #include "vp9/common/vp9_idct.h"

	17

	18 void vp9_fdct8x8_1_neon(const int16_t input, int16_t output, int stride) {

	19 int r;

	20 int16x8_t sum = vld1q_s16(&input[0]);

	21 for (r = 1; r < 8; ++r) {

	22 const int16x8_t input_00 = vld1q_s16(&input[r * stride]);

	23 sum = vaddq_s16(sum, input_00);

	24 }

	25 {

	26 const int32x4_t a = vpaddlq_s16(sum);

	27 const int64x2_t b = vpaddlq_s32(a);

	28 const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),

	29 vreinterpret_s32_s64(vget_high_s64(b)));

	30 output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);

	31 output[1] = 0;

	32 }

	33 }

	34

	35 void vp9_fdct8x8_neon(const int16_t input, int16_t final_output, int stride) {

	36 int i;

	37 // stage 1

	38 int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);

	39 int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);

	40 int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);

	41 int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);

	42 int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);

	43 int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);

	44 int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);

	45 int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);

	46 for (i = 0; i < 2; ++i) {

	47 int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;

	48 const int16x8_t v_s0 = vaddq_s16(input_0, input_7);

	49 const int16x8_t v_s1 = vaddq_s16(input_1, input_6);

	50 const int16x8_t v_s2 = vaddq_s16(input_2, input_5);

	51 const int16x8_t v_s3 = vaddq_s16(input_3, input_4);

	52 const int16x8_t v_s4 = vsubq_s16(input_3, input_4);

	53 const int16x8_t v_s5 = vsubq_s16(input_2, input_5);

	54 const int16x8_t v_s6 = vsubq_s16(input_1, input_6);

	55 const int16x8_t v_s7 = vsubq_s16(input_0, input_7);

	56 // fdct4(step, step);

	57 int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);

	58 int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);

	59 int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);

	60 int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);

	61 // fdct4(step, step);

	62 int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));

	63 int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));

	64 int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));

	65 int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));

	66 int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);

	67 int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);

	68 int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);

	69 int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);

	70 v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);

	71 v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);

	72 v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);

	73 v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);

	74 v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);

	75 v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);

	76 v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);

	77 v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);

	78 {

	79 const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);

	80 const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);

	81 const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);

	82 const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);

	83 const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);

	84 const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);

	85 const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);

	86 const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);

	87 out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43

	88 out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63

	89 out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47

	90 out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67

	91 }

	92 // Stage 2

	93 v_x0 = vsubq_s16(v_s6, v_s5);

	94 v_x1 = vaddq_s16(v_s6, v_s5);

	95 v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);

	96 v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);

	97 v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);

	98 v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);

	99 {

	100 const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);

	101 const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);

	102 const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);

	103 const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);

	104 const int16x8_t ab = vcombine_s16(a, b);

	105 const int16x8_t cd = vcombine_s16(c, d);

	106 // Stage 3

	107 v_x0 = vaddq_s16(v_s4, ab);

	108 v_x1 = vsubq_s16(v_s4, ab);

	109 v_x2 = vsubq_s16(v_s7, cd);

	110 v_x3 = vaddq_s16(v_s7, cd);

	111 }

	112 // Stage 4

	113 v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);

	114 v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);

	115 v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);

	116 v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);

	117 v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);

	118 v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);

	119 v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);

	120 v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);

	121 v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);

	122 v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);

	123 v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);

	124 v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);

	125 v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);

	126 v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);

	127 v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);

	128 v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);

	129 {

	130 const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);

	131 const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);

	132 const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);

	133 const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);

	134 const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);

	135 const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);

	136 const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);

	137 const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);

	138 out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53

	139 out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73

	140 out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57

	141 out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77

	142 }

	143 // transpose 8x8

	144 {

	145 // 00 01 02 03 40 41 42 43

	146 // 10 11 12 13 50 51 52 53

	147 // 20 21 22 23 60 61 62 63

	148 // 30 31 32 33 70 71 72 73

	149 // 04 05 06 07 44 45 46 47

	150 // 14 15 16 17 54 55 56 57

	151 // 24 25 26 27 64 65 66 67

	152 // 34 35 36 37 74 75 76 77

	153 const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0),

	154 vreinterpretq_s32_s16(out_2));

	155 const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1),

	156 vreinterpretq_s32_s16(out_3));

	157 const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4),

	158 vreinterpretq_s32_s16(out_6));

	159 const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5),

	160 vreinterpretq_s32_s16(out_7));

	161 const int16x8x2_t r01_s16 =

	162 vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),

	163 vreinterpretq_s16_s32(r13_s32.val[0]));

	164 const int16x8x2_t r23_s16 =

	165 vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),

	166 vreinterpretq_s16_s32(r13_s32.val[1]));

	167 const int16x8x2_t r45_s16 =

	168 vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),

	169 vreinterpretq_s16_s32(r57_s32.val[0]));

	170 const int16x8x2_t r67_s16 =

	171 vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),

	172 vreinterpretq_s16_s32(r57_s32.val[1]));

	173 input_0 = r01_s16.val[0];

	174 input_1 = r01_s16.val[1];

	175 input_2 = r23_s16.val[0];

	176 input_3 = r23_s16.val[1];

	177 input_4 = r45_s16.val[0];

	178 input_5 = r45_s16.val[1];

	179 input_6 = r67_s16.val[0];

	180 input_7 = r67_s16.val[1];

	181 // 00 10 20 30 40 50 60 70

	182 // 01 11 21 31 41 51 61 71

	183 // 02 12 22 32 42 52 62 72

	184 // 03 13 23 33 43 53 63 73

	185 // 04 14 24 34 44 54 64 74

	186 // 05 15 25 35 45 55 65 75

	187 // 06 16 26 36 46 56 66 76

	188 // 07 17 27 37 47 57 67 77

	189 }

	190 } // for

	191 {

	192 // from vp9_dct_sse2.c

	193 // Post-condition (division by two)

	194 // division of two 16 bits signed numbers using shifts

	195 // n / 2 = (n - (n >> 15)) >> 1

	196 const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);

	197 const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);

	198 const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);

	199 const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);

	200 const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);

	201 const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);

	202 const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);

	203 const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);

	204 input_0 = vhsubq_s16(input_0, sign_in0);

	205 input_1 = vhsubq_s16(input_1, sign_in1);

	206 input_2 = vhsubq_s16(input_2, sign_in2);

	207 input_3 = vhsubq_s16(input_3, sign_in3);

	208 input_4 = vhsubq_s16(input_4, sign_in4);

	209 input_5 = vhsubq_s16(input_5, sign_in5);

	210 input_6 = vhsubq_s16(input_6, sign_in6);

	211 input_7 = vhsubq_s16(input_7, sign_in7);

	212 // store results

	213 vst1q_s16(&final_output[0 * 8], input_0);

	214 vst1q_s16(&final_output[1 * 8], input_1);

	215 vst1q_s16(&final_output[2 * 8], input_2);

	216 vst1q_s16(&final_output[3 * 8], input_3);

	217 vst1q_s16(&final_output[4 * 8], input_4);

	218 vst1q_s16(&final_output[5 * 8], input_5);

	219 vst1q_s16(&final_output[6 * 8], input_6);

	220 vst1q_s16(&final_output[7 * 8], input_7);

	221 }

	222 }

	223

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/decoder/vp9_reader.h ('k') | source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c » ('j') | no next file with comments »