Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(56)

Side by Side Diff: source/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.c

Issue 592203002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12
/*
 * Sum of squared differences between two 16x16 blocks of 8-bit samples.
 *
 * src_ptr / source_stride: top-left sample of the first block and the byte
 *                          distance between its consecutive rows.
 * ref_ptr / recon_stride:  the same for the second block.
 * sse:                     receives the 32-bit sum of squared differences.
 *
 * Returns the same value that is written to *sse. Despite the "mse" in the
 * name, no division by the sample count is performed here.
 */
unsigned int vp8_mse16x16_neon(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse) {
  int row_pair;
  int64x2_t pair_sums;
  int64x1_t total;
  // One accumulator per group of four columns (0-3, 4-7, 8-11, 12-15).
  int32x4_t acc_c0 = vdupq_n_s32(0);
  int32x4_t acc_c4 = vdupq_n_s32(0);
  int32x4_t acc_c8 = vdupq_n_s32(0);
  int32x4_t acc_c12 = vdupq_n_s32(0);

  // Two rows are handled per pass, so 8 passes cover the 16 rows.
  for (row_pair = 0; row_pair < 8; ++row_pair) {
    const unsigned char *const src_next = src_ptr + source_stride;
    const unsigned char *const ref_next = ref_ptr + recon_stride;
    const uint8x16_t src_a = vld1q_u8(src_ptr);
    const uint8x16_t src_b = vld1q_u8(src_next);
    const uint8x16_t ref_a = vld1q_u8(ref_ptr);
    const uint8x16_t ref_b = vld1q_u8(ref_next);

    // The widening subtract wraps modulo 2^16; viewing the result as signed
    // 16-bit lanes yields the true difference in [-255, 255].
    const int16x8_t diff_a_lo = vreinterpretq_s16_u16(
        vsubl_u8(vget_low_u8(src_a), vget_low_u8(ref_a)));
    const int16x8_t diff_a_hi = vreinterpretq_s16_u16(
        vsubl_u8(vget_high_u8(src_a), vget_high_u8(ref_a)));
    const int16x8_t diff_b_lo = vreinterpretq_s16_u16(
        vsubl_u8(vget_low_u8(src_b), vget_low_u8(ref_b)));
    const int16x8_t diff_b_hi = vreinterpretq_s16_u16(
        vsubl_u8(vget_high_u8(src_b), vget_high_u8(ref_b)));

    // First row of the pair.
    acc_c0 = vmlal_s16(acc_c0, vget_low_s16(diff_a_lo),
                       vget_low_s16(diff_a_lo));
    acc_c4 = vmlal_s16(acc_c4, vget_high_s16(diff_a_lo),
                       vget_high_s16(diff_a_lo));
    acc_c8 = vmlal_s16(acc_c8, vget_low_s16(diff_a_hi),
                       vget_low_s16(diff_a_hi));
    acc_c12 = vmlal_s16(acc_c12, vget_high_s16(diff_a_hi),
                        vget_high_s16(diff_a_hi));

    // Second row of the pair.
    acc_c0 = vmlal_s16(acc_c0, vget_low_s16(diff_b_lo),
                       vget_low_s16(diff_b_lo));
    acc_c4 = vmlal_s16(acc_c4, vget_high_s16(diff_b_lo),
                       vget_high_s16(diff_b_lo));
    acc_c8 = vmlal_s16(acc_c8, vget_low_s16(diff_b_hi),
                       vget_low_s16(diff_b_hi));
    acc_c12 = vmlal_s16(acc_c12, vget_high_s16(diff_b_hi),
                        vget_high_s16(diff_b_hi));

    src_ptr = src_next + source_stride;
    ref_ptr = ref_next + recon_stride;
  }

  // Fold the four accumulators together, then reduce the four 32-bit lanes
  // to a single 64-bit total.
  acc_c0 = vaddq_s32(acc_c0, acc_c4);
  acc_c8 = vaddq_s32(acc_c8, acc_c12);
  acc_c12 = vaddq_s32(acc_c0, acc_c8);

  pair_sums = vpaddlq_s32(acc_c12);
  total = vadd_s64(vget_low_s64(pair_sums), vget_high_s64(pair_sums));

  // Lane 0 of the 32-bit view of the total is both stored and returned.
  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(total), 0);
  return vget_lane_u32(vreinterpret_u32_s64(total), 0);
}
78
/*
 * Packs the first four bytes of two rows into one 8-lane vector: row0 fills
 * the low 32 bits and row1 the high 32 bits. Exactly four bytes are read from
 * each row pointer, so nothing beyond the 4-sample-wide block is touched.
 */
static uint8x8_t get4x4sse_pack_two_rows(const unsigned char *row0,
                                         const unsigned char *row1) {
  const uint64_t lo = (uint64_t)row0[0] | ((uint64_t)row0[1] << 8) |
                      ((uint64_t)row0[2] << 16) | ((uint64_t)row0[3] << 24);
  const uint64_t hi = (uint64_t)row1[0] | ((uint64_t)row1[1] << 8) |
                      ((uint64_t)row1[2] << 16) | ((uint64_t)row1[3] << 24);
  return vcreate_u8(lo | (hi << 32));
}

/*
 * Sum of squared differences between two 4x4 blocks of 8-bit samples.
 *
 * src_ptr / source_stride: top-left sample of the first block and the byte
 *                          distance between its consecutive rows.
 * ref_ptr / recon_stride:  the same for the second block.
 *
 * Returns the 32-bit sum of the 16 squared sample differences. Only the four
 * bytes belonging to each row are read from either buffer.
 */
unsigned int vp8_get4x4sse_cs_neon(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride) {
  const unsigned char *const src_row1 = src_ptr + source_stride;
  const unsigned char *const src_row2 = src_row1 + source_stride;
  const unsigned char *const src_row3 = src_row2 + source_stride;
  const unsigned char *const ref_row1 = ref_ptr + recon_stride;
  const unsigned char *const ref_row2 = ref_row1 + recon_stride;
  const unsigned char *const ref_row3 = ref_row2 + recon_stride;

  // Rows are packed with the same layout for both blocks, so matching lanes
  // always hold co-located samples.
  const uint8x8_t src_01 = get4x4sse_pack_two_rows(src_ptr, src_row1);
  const uint8x8_t src_23 = get4x4sse_pack_two_rows(src_row2, src_row3);
  const uint8x8_t ref_01 = get4x4sse_pack_two_rows(ref_ptr, ref_row1);
  const uint8x8_t ref_23 = get4x4sse_pack_two_rows(ref_row2, ref_row3);

  // The widening subtract wraps modulo 2^16; viewing the result as signed
  // 16-bit lanes yields the true difference in [-255, 255].
  const int16x8_t diff_01 = vreinterpretq_s16_u16(vsubl_u8(src_01, ref_01));
  const int16x8_t diff_23 = vreinterpretq_s16_u16(vsubl_u8(src_23, ref_23));

  // Square and accumulate all 16 differences; each 32-bit lane receives four
  // squares of at most 255^2, so it cannot overflow.
  int32x4_t acc = vmull_s16(vget_low_s16(diff_01), vget_low_s16(diff_01));
  int64x2_t pair_sums;
  int64x1_t total;

  acc = vmlal_s16(acc, vget_high_s16(diff_01), vget_high_s16(diff_01));
  acc = vmlal_s16(acc, vget_low_s16(diff_23), vget_low_s16(diff_23));
  acc = vmlal_s16(acc, vget_high_s16(diff_23), vget_high_s16(diff_23));

  // Reduce the four 32-bit lanes to a single total.
  pair_sums = vpaddlq_s32(acc);
  total = vadd_s64(vget_low_s64(pair_sums), vget_high_s64(pair_sums));

  return vget_lane_u32(vreinterpret_u32_s64(total), 0);
}
OLDNEW
« no previous file with comments | « source/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm ('k') | source/libvpx/vp8/encoder/denoising.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698