source/libvpx/vp9/encoder/arm/neon/vp9_avg_neon.c - Issue 1302353004: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/encoder/arm/neon/vp9_avg_neon.c

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master

Patch Set: Created 5 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include <arm_neon.h>	11 #include <arm_neon.h>

	12 #include <assert.h>

	13

12 #include "./vp9_rtcd.h"	14 #include "./vp9_rtcd.h"

13 #include "./vpx_config.h"	15 #include "./vpx_config.h"

14	16

15 #include "vpx/vpx_integer.h"	17 #include "vpx/vpx_integer.h"

16	18

17 static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {	19 static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {

18 const uint32x4_t a = vpaddlq_u16(v_16x8);	20 const uint32x4_t a = vpaddlq_u16(v_16x8);

19 const uint64x2_t b = vpaddlq_u32(a);	21 const uint64x2_t b = vpaddlq_u32(a);

20 const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),	22 const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),

21 vreinterpret_u32_u64(vget_high_u64(b)));	23 vreinterpret_u32_u64(vget_high_u64(b)));

(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
107	109

108 for (i = 0; i < width; i += 16) {	110 for (i = 0; i < width; i += 16) {

109 const uint8x16_t vec_row = vld1q_u8(ref);	111 const uint8x16_t vec_row = vld1q_u8(ref);

110 vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));	112 vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));

111 vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));	113 vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));

112 ref += 16;	114 ref += 16;

113 }	115 }

114	116

115 return horizontal_add_u16x8(vec_sum);	117 return horizontal_add_u16x8(vec_sum);

116 }	118 }

	119

	120 // ref, src = [0, 510] - max diff = 16-bits

	121 // bwl = {2, 3, 4}, width = {16, 32, 64}

	122 int vp9_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {

	123 int width = 4 << bwl;

	124 int32x4_t sse = vdupq_n_s32(0);

	125 int16x8_t total = vdupq_n_s16(0);

	126

	127 assert(width >= 8);

	128 assert((width % 8) == 0);

	129

	130 do {

	131 const int16x8_t r = vld1q_s16(ref);

	132 const int16x8_t s = vld1q_s16(src);

	133 const int16x8_t diff = vsubq_s16(r, s); // [-510, 510], 10 bits.

	134 const int16x4_t diff_lo = vget_low_s16(diff);

	135 const int16x4_t diff_hi = vget_high_s16(diff);

	136 sse = vmlal_s16(sse, diff_lo, diff_lo); // dynamic range 26 bits.

	137 sse = vmlal_s16(sse, diff_hi, diff_hi);

	138 total = vaddq_s16(total, diff); // dynamic range 16 bits.

	139

	140 ref += 8;

	141 src += 8;

	142 width -= 8;

	143 } while (width != 0);

	144

	145 {

	146 // Note: 'total''s pairwise addition could be implemented similarly to

	147 // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired

	148 // with the summation of 'sse' performed better on a Cortex-A15.

	149 const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total'

	150 const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));

	151 const int32x2_t t2 = vpadd_s32(t1, t1);

	152 const int t = vget_lane_s32(t2, 0);

	153 const int64x2_t s0 = vpaddlq_s32(sse); // cascading summation of 'sse'.

	154 const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),

	155 vreinterpret_s32_s64(vget_high_s64(s0)));

	156 const int s = vget_lane_s32(s1, 0);

	157 const int shift_factor = bwl + 2;

	158 return s - ((t * t) >> shift_factor);

	159 }

	160 }

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/decoder/vp9_decoder.c ('k') | source/libvpx/vp9/encoder/vp9_aq_complexity.c » ('j') | no next file with comments »