source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.c - Issue 812033011: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.c

Issue 812033011: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 /*

	2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.

	3 *

	4 * Use of this source code is governed by a BSD-style license

	5 * that can be found in the LICENSE file in the root of the source

	6 * tree. An additional intellectual property rights grant can be found

	7 * in the file PATENTS. All contributing project authors may

	8 * be found in the AUTHORS file in the root of the source tree.

	9 */

	10

	11 #include <stddef.h>

	12 #include <arm_neon.h>

	13

	14 #include "./vpx_config.h"

	15 #include "vpx_ports/mem.h"

	16

	17 void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,

	18 uint8_t *dst, ptrdiff_t dst_stride,

	19 const int16_t *filter_x, int x_step_q4,

	20 const int16_t *filter_y, int y_step_q4,

	21 int w, int h);

	22 void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,

	23 uint8_t *dst, ptrdiff_t dst_stride,

	24 const int16_t *filter_x, int x_step_q4,

	25 const int16_t *filter_y, int y_step_q4,

	26 int w, int h);

	27

	28 static INLINE int32x4_t MULTIPLY_BY_Q0(

	29 int16x4_t dsrc0,

	30 int16x4_t dsrc1,

	31 int16x4_t dsrc2,

	32 int16x4_t dsrc3,

	33 int16x4_t dsrc4,

	34 int16x4_t dsrc5,

	35 int16x4_t dsrc6,

	36 int16x4_t dsrc7,

	37 int16x8_t q0s16) {

	38 int32x4_t qdst;

	39 int16x4_t d0s16, d1s16;

	40

	41 d0s16 = vget_low_s16(q0s16);

	42 d1s16 = vget_high_s16(q0s16);

	43

	44 qdst = vmull_lane_s16(dsrc0, d0s16, 0);

	45 qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);

	46 qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);

	47 qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);

	48 qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);

	49 qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);

	50 qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);

	51 qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);

	52 return qdst;

	53 }

	54

	55 void vp9_convolve8_horiz_neon(

	56 uint8_t *src,

	57 ptrdiff_t src_stride,

	58 uint8_t *dst,

	59 ptrdiff_t dst_stride,

	60 const int16_t *filter_x,

	61 int x_step_q4,

	62 const int16_t *filter_y, // unused

	63 int y_step_q4, // unused

	64 int w,

	65 int h) {

	66 int width;

	67 uint8_t s, d, psrc, pdst;

	68 uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;

	69 uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;

	70 uint8x16_t q12u8, q13u8, q14u8, q15u8;

	71 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;

	72 int16x4_t d24s16, d25s16, d26s16, d27s16;

	73 uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;

	74 int16x8_t q0s16;

	75 uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;

	76 int32x4_t q1s32, q2s32, q14s32, q15s32;

	77 uint16x8x2_t q0x2u16;

	78 uint8x8x2_t d0x2u8, d1x2u8;

	79 uint32x2x2_t d0x2u32;

	80 uint16x4x2_t d0x2u16, d1x2u16;

	81 uint32x4x2_t q0x2u32;

	82

	83 if (x_step_q4 != 16) {

	84 vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,

	85 filter_x, x_step_q4,

	86 filter_y, y_step_q4, w, h);

	87 return;

	88 }

	89

	90 q0s16 = vld1q_s16(filter_x);

	91

	92 src -= 3; // adjust for taps

	93 for (; h > 0; h -= 4,

	94 src += src_stride * 4,

	95 dst += dst_stride * 4) { // loop_horiz_v

	96 s = src;

	97 d24u8 = vld1_u8(s);

	98 s += src_stride;

	99 d25u8 = vld1_u8(s);

	100 s += src_stride;

	101 d26u8 = vld1_u8(s);

	102 s += src_stride;

	103 d27u8 = vld1_u8(s);

	104

	105 q12u8 = vcombine_u8(d24u8, d25u8);

	106 q13u8 = vcombine_u8(d26u8, d27u8);

	107

	108 q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),

	109 vreinterpretq_u16_u8(q13u8));

	110 d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));

	111 d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));

	112 d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));

	113 d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));

	114 d0x2u8 = vtrn_u8(d24u8, d25u8);

	115 d1x2u8 = vtrn_u8(d26u8, d27u8);

	116

	117 __builtin_prefetch(src + src_stride * 4);

	118 __builtin_prefetch(src + src_stride * 5);

	119 __builtin_prefetch(src + src_stride * 6);

	120

	121 q8u16 = vmovl_u8(d0x2u8.val[0]);

	122 q9u16 = vmovl_u8(d0x2u8.val[1]);

	123 q10u16 = vmovl_u8(d1x2u8.val[0]);

	124 q11u16 = vmovl_u8(d1x2u8.val[1]);

	125

	126 d16u16 = vget_low_u16(q8u16);

	127 d17u16 = vget_high_u16(q8u16);

	128 d18u16 = vget_low_u16(q9u16);

	129 d19u16 = vget_high_u16(q9u16);

	130 q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18

	131 q9u16 = vcombine_u16(d17u16, d19u16);

	132

	133 d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));

	134 d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21

	135 for (width = w, psrc = src + 7, pdst = dst;

	136 width > 0;

	137 width -= 4, psrc += 4, pdst += 4) { // loop_horiz

	138 s = psrc;

	139 d28u32 = vld1_dup_u32((const uint32_t *)s);

	140 s += src_stride;

	141 d29u32 = vld1_dup_u32((const uint32_t *)s);

	142 s += src_stride;

	143 d31u32 = vld1_dup_u32((const uint32_t *)s);

	144 s += src_stride;

	145 d30u32 = vld1_dup_u32((const uint32_t *)s);

	146

	147 __builtin_prefetch(psrc + 64);

	148

	149 d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),

	150 vreinterpret_u16_u32(d31u32));

	151 d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),

	152 vreinterpret_u16_u32(d30u32));

	153 d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28

	154 vreinterpret_u8_u16(d1x2u16.val[0])); // d29

	155 d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31

	156 vreinterpret_u8_u16(d1x2u16.val[1])); // d30

	157

	158 __builtin_prefetch(psrc + 64 + src_stride);

	159

	160 q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);

	161 q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);

	162 q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),

	163 vreinterpretq_u32_u8(q15u8));

	164

	165 d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));

	166 d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));

	167 q12u16 = vmovl_u8(d28u8);

	168 q13u16 = vmovl_u8(d29u8);

	169

	170 __builtin_prefetch(psrc + 64 + src_stride * 2);

	171

	172 d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));

	173 d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));

	174 d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));

	175 d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));

	176 d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));

	177 d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));

	178 d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));

	179 d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));

	180 d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

	181

	182 q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,

	183 d18s16, d19s16, d23s16, d24s16, q0s16);

	184 q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,

	185 d19s16, d23s16, d24s16, d26s16, q0s16);

	186 q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,

	187 d23s16, d24s16, d26s16, d27s16, q0s16);

	188 q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,

	189 d24s16, d26s16, d27s16, d25s16, q0s16);

	190

	191 __builtin_prefetch(psrc + 60 + src_stride * 3);

	192

	193 d2u16 = vqrshrun_n_s32(q1s32, 7);

	194 d3u16 = vqrshrun_n_s32(q2s32, 7);

	195 d4u16 = vqrshrun_n_s32(q14s32, 7);

	196 d5u16 = vqrshrun_n_s32(q15s32, 7);

	197

	198 q1u16 = vcombine_u16(d2u16, d3u16);

	199 q2u16 = vcombine_u16(d4u16, d5u16);

	200

	201 d2u8 = vqmovn_u16(q1u16);

	202 d3u8 = vqmovn_u16(q2u16);

	203

	204 d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),

	205 vreinterpret_u16_u8(d3u8));

	206 d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),

	207 vreinterpret_u32_u16(d0x2u16.val[1]));

	208 d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),

	209 vreinterpret_u8_u32(d0x2u32.val[1]));

	210

	211 d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);

	212 d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);

	213

	214 d = pdst;

	215 vst1_lane_u32((uint32_t *)d, d2u32, 0);

	216 d += dst_stride;

	217 vst1_lane_u32((uint32_t *)d, d3u32, 0);

	218 d += dst_stride;

	219 vst1_lane_u32((uint32_t *)d, d2u32, 1);

	220 d += dst_stride;

	221 vst1_lane_u32((uint32_t *)d, d3u32, 1);

	222

	223 q8u16 = q9u16;

	224 d20s16 = d23s16;

	225 q11u16 = q12u16;

	226 q9u16 = q13u16;

	227 d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));

	228 }

	229 }

	230 return;

	231 }

	232

	233 void vp9_convolve8_vert_neon(

	234 uint8_t *src,

	235 ptrdiff_t src_stride,

	236 uint8_t *dst,

	237 ptrdiff_t dst_stride,

	238 const int16_t *filter_x, // unused

	239 int x_step_q4, // unused

	240 const int16_t *filter_y,

	241 int y_step_q4,

	242 int w,

	243 int h) {

	244 int height;

	245 uint8_t s, d;

	246 uint32x2_t d2u32, d3u32;

	247 uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;

	248 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;

	249 int16x4_t d24s16, d25s16, d26s16, d27s16;

	250 uint16x4_t d2u16, d3u16, d4u16, d5u16;

	251 int16x8_t q0s16;

	252 uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;

	253 int32x4_t q1s32, q2s32, q14s32, q15s32;

	254

	255 if (y_step_q4 != 16) {

	256 vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,

	257 filter_x, x_step_q4,

	258 filter_y, y_step_q4, w, h);

	259 return;

	260 }

	261

	262 src -= src_stride * 3;

	263 q0s16 = vld1q_s16(filter_y);

	264 for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h

	265 s = src;

	266 d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);

	267 s += src_stride;

	268 d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);

	269 s += src_stride;

	270 d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);

	271 s += src_stride;

	272 d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);

	273 s += src_stride;

	274 d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);

	275 s += src_stride;

	276 d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);

	277 s += src_stride;

	278 d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);

	279 s += src_stride;

	280

	281 q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));

	282 q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));

	283 q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));

	284 q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));

	285

	286 d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));

	287 d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));

	288 d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));

	289 d = dst;

	290 for (height = h; height > 0; height -= 4) { // loop_vert

	291 d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);

	292 s += src_stride;

	293 d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);

	294 s += src_stride;

	295 d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);

	296 s += src_stride;

	297 d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);

	298 s += src_stride;

	299

	300 q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));

	301 q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));

	302

	303 d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));

	304 d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));

	305 d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));

	306 d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));

	307 d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));

	308 d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));

	309 d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));

	310 d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

	311

	312 __builtin_prefetch(d);

	313 __builtin_prefetch(d + dst_stride);

	314 q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,

	315 d20s16, d21s16, d22s16, d24s16, q0s16);

	316 __builtin_prefetch(d + dst_stride * 2);

	317 __builtin_prefetch(d + dst_stride * 3);

	318 q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,

	319 d21s16, d22s16, d24s16, d26s16, q0s16);

	320 __builtin_prefetch(s);

	321 __builtin_prefetch(s + src_stride);

	322 q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,

	323 d22s16, d24s16, d26s16, d27s16, q0s16);

	324 __builtin_prefetch(s + src_stride * 2);

	325 __builtin_prefetch(s + src_stride * 3);

	326 q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,

	327 d24s16, d26s16, d27s16, d25s16, q0s16);

	328

	329 d2u16 = vqrshrun_n_s32(q1s32, 7);

	330 d3u16 = vqrshrun_n_s32(q2s32, 7);

	331 d4u16 = vqrshrun_n_s32(q14s32, 7);

	332 d5u16 = vqrshrun_n_s32(q15s32, 7);

	333

	334 q1u16 = vcombine_u16(d2u16, d3u16);

	335 q2u16 = vcombine_u16(d4u16, d5u16);

	336

	337 d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));

	338 d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));

	339

	340 vst1_lane_u32((uint32_t *)d, d2u32, 0);

	341 d += dst_stride;

	342 vst1_lane_u32((uint32_t *)d, d2u32, 1);

	343 d += dst_stride;

	344 vst1_lane_u32((uint32_t *)d, d3u32, 0);

	345 d += dst_stride;

	346 vst1_lane_u32((uint32_t *)d, d3u32, 1);

	347 d += dst_stride;

	348

	349 q8u16 = q10u16;

	350 d18s16 = d22s16;

	351 d19s16 = d24s16;

	352 q10u16 = q13u16;

	353 d22s16 = d25s16;

	354 }

	355 }

	356 return;

	357 }

OLD	NEW