Chromium Code Reviews

Side by Side Diff: source/libvpx/third_party/libyuv/source/scale_neon64.cc

Issue 996503002: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 9 months ago
1 /* 1 /*
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include "libyuv/scale.h"
11 #include "libyuv/row.h" 12 #include "libyuv/row.h"
13 #include "libyuv/scale_row.h"
12 14
13 #ifdef __cplusplus 15 #ifdef __cplusplus
14 namespace libyuv { 16 namespace libyuv {
15 extern "C" { 17 extern "C" {
16 #endif 18 #endif
17 19
18 // This module is for GCC Neon. 20 // This module is for GCC Neon armv8 64 bit.
19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 21 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
20 #ifdef HAS_SCALEROWDOWN2_NEON 22
21 // Read 32x1 throw away even pixels, and write 16x1. 23 // Read 32x1 throw away even pixels, and write 16x1.
22 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 24 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
23 uint8* dst, int dst_width) { 25 uint8* dst, int dst_width) {
24 asm volatile ( 26 asm volatile (
25 ".p2align 2 \n"
26 "1: \n" 27 "1: \n"
27 // load even pixels into q0, odd into q1 28 // load even pixels into v0, odd into v1
28 MEMACCESS(0) 29 MEMACCESS(0)
29 "vld2.8 {q0, q1}, [%0]! \n" 30 "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
30 "subs %2, %2, #16 \n" // 16 processed per loop 31 "subs %2, %2, #16 \n" // 16 processed per loop
31 MEMACCESS(1) 32 MEMACCESS(1)
32 "vst1.8 {q1}, [%1]! \n" // store odd pixels 33 "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
33 "bgt 1b \n" 34 "b.gt 1b \n"
34 : "+r"(src_ptr), // %0 35 : "+r"(src_ptr), // %0
35 "+r"(dst), // %1 36 "+r"(dst), // %1
36 "+r"(dst_width) // %2 37 "+r"(dst_width) // %2
37 : 38 :
38 : "q0", "q1" // Clobber List 39 : "v0", "v1" // Clobber List
39 ); 40 );
40 } 41 }
41 #endif //HAS_SCALEROWDOWN2_NEON
42 42
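As a reading aid, here is a minimal scalar sketch of what the routine above computes; the _sketch name and the plain C++ loop are illustrative only (standard uint8_t is used instead of libyuv's uint8 typedef). The ld2 de-interleaves even-indexed bytes into v0 and odd-indexed bytes into v1, and only v1 is stored, so each output pixel is the odd pixel of a pair.

#include <stddef.h>
#include <stdint.h>

// Keep the odd pixel of each pair; dst_width is the output width,
// so the source row is assumed to be 2 * dst_width bytes.
static void ScaleRowDown2_sketch(const uint8_t* src_ptr, ptrdiff_t /*src_stride*/,
                                 uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[2 * x + 1];
  }
}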
43 #ifdef HAS_SCALEROWDOWN2_NEON
44 // Read 32x2 average down and write 16x1. 43 // Read 32x2 average down and write 16x1.
45 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 44 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
46 uint8* dst, int dst_width) { 45 uint8* dst, int dst_width) {
47 asm volatile ( 46 asm volatile (
48 // change the stride to row 2 pointer 47 // change the stride to row 2 pointer
49 "add %1, %0 \n" 48 "add %1, %1, %0 \n"
50 ".p2align 2 \n"
51 "1: \n" 49 "1: \n"
52 MEMACCESS(0) 50 MEMACCESS(0)
53 "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc 51 "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
54 MEMACCESS(1) 52 MEMACCESS(1)
55 "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc 53 "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
56 "subs %3, %3, #16 \n" // 16 processed per loop 54 "subs %3, %3, #16 \n" // 16 processed per loop
57 "vpaddl.u8 q0, q0 \n" // row 1 add adjacent 55 "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
58 "vpaddl.u8 q1, q1 \n" 56 "uaddlp v1.8h, v1.16b \n"
59 "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1 57 "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1
60 "vpadal.u8 q1, q3 \n" 58 "uadalp v1.8h, v3.16b \n"
61 "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack 59 "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
62 "vrshrn.u16 d1, q1, #2 \n" 60 "rshrn2 v0.16b, v1.8h, #2 \n"
63 MEMACCESS(2) 61 MEMACCESS(2)
64 "vst1.8 {q0}, [%2]! \n" 62 "st1 {v0.16b}, [%2], #16 \n"
65 "bgt 1b \n" 63 "b.gt 1b \n"
66 : "+r"(src_ptr), // %0 64 : "+r"(src_ptr), // %0
67 "+r"(src_stride), // %1 65 "+r"(src_stride), // %1
68 "+r"(dst), // %2 66 "+r"(dst), // %2
69 "+r"(dst_width) // %3 67 "+r"(dst_width) // %3
70 : 68 :
71 : "q0", "q1", "q2", "q3" // Clobber List 69 : "v0", "v1", "v2", "v3" // Clobber List
72 ); 70 );
73 } 71 }
74 #endif //HAS_SCALEROWDOWN2_NEON
75 72
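A scalar sketch of the 2x2 box filter above (illustrative only): the uaddlp/uadalp pair builds 16-bit sums of each 2x2 block and rshrn #2 divides by 4 with rounding.

#include <stddef.h>
#include <stdint.h>

// Average each 2x2 block with rounding; src_stride is assumed to be
// the byte offset from row 0 to row 1.
static void ScaleRowDown2Box_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                    uint8_t* dst, int dst_width) {
  const uint8_t* s0 = src_ptr;
  const uint8_t* s1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    int sum = s0[2 * x] + s0[2 * x + 1] + s1[2 * x] + s1[2 * x + 1];
    dst[x] = (uint8_t)((sum + 2) >> 2);
  }
}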
76 #ifdef HAS_SCALEROWDOWN4_NEON
77 void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 73 void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
78 uint8* dst_ptr, int dst_width) { 74 uint8* dst_ptr, int dst_width) {
79 asm volatile ( 75 asm volatile (
80 ".p2align 2 \n"
81 "1: \n" 76 "1: \n"
82 MEMACCESS(0) 77 MEMACCESS(0)
83 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 78 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
84 "subs %2, %2, #8 \n" // 8 processed per loop 79 "subs %2, %2, #8 \n" // 8 processed per loop
85 MEMACCESS(1) 80 MEMACCESS(1)
86 "vst1.8 {d2}, [%1]! \n" 81 "st1 {v2.8b}, [%1], #8 \n"
87 "bgt 1b \n" 82 "b.gt 1b \n"
88 : "+r"(src_ptr), // %0 83 : "+r"(src_ptr), // %0
89 "+r"(dst_ptr), // %1 84 "+r"(dst_ptr), // %1
90 "+r"(dst_width) // %2 85 "+r"(dst_width) // %2
91 : 86 :
92 : "q0", "q1", "memory", "cc" 87 : "v0", "v1", "v2", "v3", "memory", "cc"
93 ); 88 );
94 } 89 }
95 #endif //HAS_SCALEROWDOWN4_NEON
96 90
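A scalar sketch of the 4:1 point sample above (illustrative only): the ld4 splits the row into lanes 0..3 of each group of four and only lane 2 (v2) is stored.

#include <stddef.h>
#include <stdint.h>

static void ScaleRowDown4_sketch(const uint8_t* src_ptr, ptrdiff_t /*src_stride*/,
                                 uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[4 * x + 2];  // pixel 2 of every group of 4
  }
}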
97 #ifdef HAS_SCALEROWDOWN4_NEON
98 void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 91 void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
99 uint8* dst_ptr, int dst_width) { 92 uint8* dst_ptr, int dst_width) {
100 const uint8* src_ptr1 = src_ptr + src_stride; 93 const uint8* src_ptr1 = src_ptr + src_stride;
101 const uint8* src_ptr2 = src_ptr + src_stride * 2; 94 const uint8* src_ptr2 = src_ptr + src_stride * 2;
102 const uint8* src_ptr3 = src_ptr + src_stride * 3; 95 const uint8* src_ptr3 = src_ptr + src_stride * 3;
103 asm volatile ( 96 asm volatile (
104 ".p2align 2 \n"
105 "1: \n" 97 "1: \n"
106 MEMACCESS(0) 98 MEMACCESS(0)
107 "vld1.8 {q0}, [%0]! \n" // load up 16x4 99 "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
108 MEMACCESS(3) 100 MEMACCESS(3)
109 "vld1.8 {q1}, [%3]! \n" 101 "ld1 {v1.16b}, [%2], #16 \n"
110 MEMACCESS(4) 102 MEMACCESS(4)
111 "vld1.8 {q2}, [%4]! \n" 103 "ld1 {v2.16b}, [%3], #16 \n"
112 MEMACCESS(5) 104 MEMACCESS(5)
113 "vld1.8 {q3}, [%5]! \n" 105 "ld1 {v3.16b}, [%4], #16 \n"
114 "subs %2, %2, #4 \n" 106 "subs %5, %5, #4 \n"
115 "vpaddl.u8 q0, q0 \n" 107 "uaddlp v0.8h, v0.16b \n"
116 "vpadal.u8 q0, q1 \n" 108 "uadalp v0.8h, v1.16b \n"
117 "vpadal.u8 q0, q2 \n" 109 "uadalp v0.8h, v2.16b \n"
118 "vpadal.u8 q0, q3 \n" 110 "uadalp v0.8h, v3.16b \n"
119 "vpaddl.u16 q0, q0 \n" 111 "addp v0.8h, v0.8h, v0.8h \n"
120 "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding 112 "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
121 "vmovn.u16 d0, q0 \n"
122 MEMACCESS(1) 113 MEMACCESS(1)
123 "vst1.32 {d0[0]}, [%1]! \n" 114 "st1 {v0.s}[0], [%1], #4 \n"
124 "bgt 1b \n" 115 "b.gt 1b \n"
125 : "+r"(src_ptr), // %0 116 : "+r"(src_ptr), // %0
126 "+r"(dst_ptr), // %1 117 "+r"(dst_ptr), // %1
127 "+r"(dst_width), // %2 118 "+r"(src_ptr1), // %2
128 "+r"(src_ptr1), // %3 119 "+r"(src_ptr2), // %3
129 "+r"(src_ptr2), // %4 120 "+r"(src_ptr3), // %4
130 "+r"(src_ptr3) // %5 121 "+r"(dst_width) // %5
131 : 122 :
132 : "q0", "q1", "q2", "q3", "memory", "cc" 123 : "v0", "v1", "v2", "v3", "memory", "cc"
133 ); 124 );
134 } 125 }
135 #endif //HAS_SCALEROWDOWN4_NEON
136 126
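A scalar sketch of the 4x4 box filter above (illustrative only): each output byte is the rounded average of a 4x4 block, which is what the uaddlp/uadalp chain, the pairwise addp and the rshrn #4 compute.

#include <stddef.h>
#include <stdint.h>

static void ScaleRowDown4Box_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                    uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (int row = 0; row < 4; ++row) {
      const uint8_t* s = src_ptr + row * src_stride + 4 * x;
      sum += s[0] + s[1] + s[2] + s[3];
    }
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);  // divide by 16 with rounding
  }
}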
137 #ifdef HAS_SCALEROWDOWN34_NEON
138 // Down scale from 4 to 3 pixels. Use the neon multilane read/write 127 // Down scale from 4 to 3 pixels. Use the neon multilane read/write
139 // to load up every 4th pixel into 4 different registers. 128 // to load up every 4th pixel into 4 different registers.
140 // Point samples 32 pixels to 24 pixels. 129 // Point samples 32 pixels to 24 pixels.
141 void ScaleRowDown34_NEON(const uint8* src_ptr, 130 void ScaleRowDown34_NEON(const uint8* src_ptr,
142 ptrdiff_t src_stride, 131 ptrdiff_t src_stride,
143 uint8* dst_ptr, int dst_width) { 132 uint8* dst_ptr, int dst_width) {
144 asm volatile ( 133 asm volatile (
145 ".p2align 2 \n" 134 "1: \n"
146 "1: \n"
147 MEMACCESS(0) 135 MEMACCESS(0)
148 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 136 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
149 "subs %2, %2, #24 \n" 137 "subs %2, %2, #24 \n"
150 "vmov d2, d3 \n" // order d0, d1, d2 138 "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
151 MEMACCESS(1) 139 MEMACCESS(1)
152 "vst3.8 {d0, d1, d2}, [%1]! \n" 140 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
153 "bgt 1b \n" 141 "b.gt 1b \n"
154 : "+r"(src_ptr), // %0 142 : "+r"(src_ptr), // %0
155 "+r"(dst_ptr), // %1 143 "+r"(dst_ptr), // %1
156 "+r"(dst_width) // %2 144 "+r"(dst_width) // %2
157 : 145 :
158 : "d0", "d1", "d2", "d3", "memory", "cc" 146 : "v0", "v1", "v2", "v3", "memory", "cc"
159 ); 147 );
160 } 148 }
161 #endif //HAS_SCALEROWDOWN34_NEON
162 149
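A scalar sketch of the 4-to-3 point sample above (illustrative only): copying v3 over v2 before the st3 means pixels 0, 1 and 3 of each group of four survive; dst_width is assumed to be a multiple of 3.

#include <stddef.h>
#include <stdint.h>

static void ScaleRowDown34_sketch(const uint8_t* src_ptr, ptrdiff_t /*src_stride*/,
                                  uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}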
163 #ifdef HAS_SCALEROWDOWN34_NEON
164 void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, 150 void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
165 ptrdiff_t src_stride, 151 ptrdiff_t src_stride,
166 uint8* dst_ptr, int dst_width) { 152 uint8* dst_ptr, int dst_width) {
167 asm volatile ( 153 asm volatile (
168 "vmov.u8 d24, #3 \n" 154 "movi v20.8b, #3 \n"
169 "add %3, %0 \n" 155 "add %3, %3, %0 \n"
170 ".p2align 2 \n" 156 "1: \n"
171 "1: \n"
172 MEMACCESS(0) 157 MEMACCESS(0)
173 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 158 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
174 MEMACCESS(3) 159 MEMACCESS(3)
175 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 160 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
176 "subs %2, %2, #24 \n" 161 "subs %2, %2, #24 \n"
177 162
178 // filter src line 0 with src line 1 163 // filter src line 0 with src line 1
179 // expand chars to shorts to allow for room 164 // expand chars to shorts to allow for room
180 // when adding lines together 165 // when adding lines together
181 "vmovl.u8 q8, d4 \n" 166 "ushll v16.8h, v4.8b, #0 \n"
182 "vmovl.u8 q9, d5 \n" 167 "ushll v17.8h, v5.8b, #0 \n"
183 "vmovl.u8 q10, d6 \n" 168 "ushll v18.8h, v6.8b, #0 \n"
184 "vmovl.u8 q11, d7 \n" 169 "ushll v19.8h, v7.8b, #0 \n"
185 170
186 // 3 * line_0 + line_1 171 // 3 * line_0 + line_1
187 "vmlal.u8 q8, d0, d24 \n" 172 "umlal v16.8h, v0.8b, v20.8b \n"
188 "vmlal.u8 q9, d1, d24 \n" 173 "umlal v17.8h, v1.8b, v20.8b \n"
189 "vmlal.u8 q10, d2, d24 \n" 174 "umlal v18.8h, v2.8b, v20.8b \n"
190 "vmlal.u8 q11, d3, d24 \n" 175 "umlal v19.8h, v3.8b, v20.8b \n"
191 176
192 // (3 * line_0 + line_1) >> 2 177 // (3 * line_0 + line_1) >> 2
193 "vqrshrn.u16 d0, q8, #2 \n" 178 "uqrshrn v0.8b, v16.8h, #2 \n"
194 "vqrshrn.u16 d1, q9, #2 \n" 179 "uqrshrn v1.8b, v17.8h, #2 \n"
195 "vqrshrn.u16 d2, q10, #2 \n" 180 "uqrshrn v2.8b, v18.8h, #2 \n"
196 "vqrshrn.u16 d3, q11, #2 \n" 181 "uqrshrn v3.8b, v19.8h, #2 \n"
197 182
198 // a0 = (src[0] * 3 + s[1] * 1) >> 2 183 // a0 = (src[0] * 3 + s[1] * 1) >> 2
199 "vmovl.u8 q8, d1 \n" 184 "ushll v16.8h, v1.8b, #0 \n"
200 "vmlal.u8 q8, d0, d24 \n" 185 "umlal v16.8h, v0.8b, v20.8b \n"
201 "vqrshrn.u16 d0, q8, #2 \n" 186 "uqrshrn v0.8b, v16.8h, #2 \n"
202 187
203 // a1 = (src[1] * 1 + s[2] * 1) >> 1 188 // a1 = (src[1] * 1 + s[2] * 1) >> 1
204 "vrhadd.u8 d1, d1, d2 \n" 189 "urhadd v1.8b, v1.8b, v2.8b \n"
205 190
206 // a2 = (src[2] * 1 + s[3] * 3) >> 2 191 // a2 = (src[2] * 1 + s[3] * 3) >> 2
207 "vmovl.u8 q8, d2 \n" 192 "ushll v16.8h, v2.8b, #0 \n"
208 "vmlal.u8 q8, d3, d24 \n" 193 "umlal v16.8h, v3.8b, v20.8b \n"
209 "vqrshrn.u16 d2, q8, #2 \n" 194 "uqrshrn v2.8b, v16.8h, #2 \n"
210 195
211 MEMACCESS(1) 196 MEMACCESS(1)
212 "vst3.8 {d0, d1, d2}, [%1]! \n" 197 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
213 198
214 "bgt 1b \n" 199 "b.gt 1b \n"
215 : "+r"(src_ptr), // %0 200 : "+r"(src_ptr), // %0
216 "+r"(dst_ptr), // %1 201 "+r"(dst_ptr), // %1
217 "+r"(dst_width), // %2 202 "+r"(dst_width), // %2
218 "+r"(src_stride) // %3 203 "+r"(src_stride) // %3
219 : 204 :
220 : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" 205 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
206 "v20", "memory", "cc"
221 ); 207 );
222 } 208 }
223 #endif //ScaleRowDown34_0_Box_NEON
224 209
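A scalar sketch of the filtered 4-to-3 variant above (illustrative only): the two rows are first blended 3:1, then resampled horizontally with the (3,1)/4, (1,1)/2 and (1,3)/4 taps listed in the comments; the NEON code uses rounding narrows, so the sketch rounds the same way.

#include <stddef.h>
#include <stdint.h>

static void ScaleRowDown34_0_Box_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                        uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s0 = src_ptr;
  const uint8_t* s1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; x += 3) {
    uint8_t p[4];
    for (int i = 0; i < 4; ++i) {
      p[i] = (uint8_t)((3 * s0[i] + s1[i] + 2) >> 2);  // blend row 0 and row 1, 3:1
    }
    dst_ptr[x + 0] = (uint8_t)((3 * p[0] + p[1] + 2) >> 2);
    dst_ptr[x + 1] = (uint8_t)((p[1] + p[2] + 1) >> 1);
    dst_ptr[x + 2] = (uint8_t)((p[2] + 3 * p[3] + 2) >> 2);
    s0 += 4;
    s1 += 4;
  }
}

The _1_Box variant further down follows the same pattern, except the two rows are blended 1:1 (urhadd) instead of 3:1.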
225 #ifdef HAS_SCALEROWDOWN34_NEON
226 void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, 210 void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
227 ptrdiff_t src_stride, 211 ptrdiff_t src_stride,
228 uint8* dst_ptr, int dst_width) { 212 uint8* dst_ptr, int dst_width) {
229 asm volatile ( 213 asm volatile (
230 "vmov.u8 d24, #3 \n" 214 "movi v20.8b, #3 \n"
231 "add %3, %0 \n" 215 "add %3, %3, %0 \n"
232 ".p2align 2 \n" 216 "1: \n"
233 "1: \n"
234 MEMACCESS(0) 217 MEMACCESS(0)
235 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 218 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
236 MEMACCESS(3) 219 MEMACCESS(3)
237 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 220 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
238 "subs %2, %2, #24 \n" 221 "subs %2, %2, #24 \n"
239 // average src line 0 with src line 1 222 // average src line 0 with src line 1
240 "vrhadd.u8 q0, q0, q2 \n" 223 "urhadd v0.8b, v0.8b, v4.8b \n"
241 "vrhadd.u8 q1, q1, q3 \n" 224 "urhadd v1.8b, v1.8b, v5.8b \n"
225 "urhadd v2.8b, v2.8b, v6.8b \n"
226 "urhadd v3.8b, v3.8b, v7.8b \n"
242 227
243 // a0 = (src[0] * 3 + s[1] * 1) >> 2 228 // a0 = (src[0] * 3 + s[1] * 1) >> 2
244 "vmovl.u8 q3, d1 \n" 229 "ushll v4.8h, v1.8b, #0 \n"
245 "vmlal.u8 q3, d0, d24 \n" 230 "umlal v4.8h, v0.8b, v20.8b \n"
246 "vqrshrn.u16 d0, q3, #2 \n" 231 "uqrshrn v0.8b, v4.8h, #2 \n"
247 232
248 // a1 = (src[1] * 1 + s[2] * 1) >> 1 233 // a1 = (src[1] * 1 + s[2] * 1) >> 1
249 "vrhadd.u8 d1, d1, d2 \n" 234 "urhadd v1.8b, v1.8b, v2.8b \n"
250 235
251 // a2 = (src[2] * 1 + s[3] * 3) >> 2 236 // a2 = (src[2] * 1 + s[3] * 3) >> 2
252 "vmovl.u8 q3, d2 \n" 237 "ushll v4.8h, v2.8b, #0 \n"
253 "vmlal.u8 q3, d3, d24 \n" 238 "umlal v4.8h, v3.8b, v20.8b \n"
254 "vqrshrn.u16 d2, q3, #2 \n" 239 "uqrshrn v2.8b, v4.8h, #2 \n"
255 240
256 MEMACCESS(1) 241 MEMACCESS(1)
257 "vst3.8 {d0, d1, d2}, [%1]! \n" 242 "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
258 "bgt 1b \n" 243 "b.gt 1b \n"
259 : "+r"(src_ptr), // %0 244 : "+r"(src_ptr), // %0
260 "+r"(dst_ptr), // %1 245 "+r"(dst_ptr), // %1
261 "+r"(dst_width), // %2 246 "+r"(dst_width), // %2
262 "+r"(src_stride) // %3 247 "+r"(src_stride) // %3
263 : 248 :
264 : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" 249 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
265 ); 250 );
266 } 251 }
267 #endif //HAS_SCALEROWDOWN34_NEON
268 252
269 #ifdef HAS_SCALEROWDOWN38_NEON
270 #define HAS_SCALEROWDOWN38_NEON
271 static uvec8 kShuf38 = 253 static uvec8 kShuf38 =
272 { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; 254 { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
273 static uvec8 kShuf38_2 = 255 static uvec8 kShuf38_2 =
274 { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; 256 { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
275 static vec16 kMult38_Div6 = 257 static vec16 kMult38_Div6 =
276 { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 258 { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
277 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; 259 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
278 static vec16 kMult38_Div9 = 260 static vec16 kMult38_Div9 =
279 { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 261 { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
280 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; 262 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
281 263
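The Div6/Div9 constants above implement division by a non-power-of-two via the doubling, rounding, high-half multiply (vqrdmulh/sqrdmulh), which computes roughly (2 * a * b + 32768) >> 16 with saturation. A small sketch of the arithmetic (illustrative only):

#include <stdint.h>

static int16_t sqrdmulh_sketch(int16_t a, int16_t b) {
  int32_t r = (2 * (int32_t)a * (int32_t)b + (1 << 15)) >> 16;
  if (r > 32767) r = 32767;      // saturate like the instruction
  if (r < -32768) r = -32768;
  return (int16_t)r;
}

// Because of the built-in factor of 2, multiplying by 65536 / 12 acts as
// division by 6 and 65536 / 18 as division by 9. For example, a sum of six
// pixels each equal to 200 is 1200, and sqrdmulh_sketch(1200, 65536 / 12)
// evaluates to 200.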
282 // 32 -> 12 264 // 32 -> 12
283 void ScaleRowDown38_NEON(const uint8* src_ptr, 265 void ScaleRowDown38_NEON(const uint8* src_ptr,
284 ptrdiff_t src_stride, 266 ptrdiff_t src_stride,
285 uint8* dst_ptr, int dst_width) { 267 uint8* dst_ptr, int dst_width) {
286 asm volatile ( 268 asm volatile (
287 MEMACCESS(3) 269 MEMACCESS(3)
288 "vld1.8 {q3}, [%3] \n" 270 "ld1 {v3.16b}, [%3] \n"
289 ".p2align 2 \n" 271 "1: \n"
290 "1: \n" 272 MEMACCESS(0)
291 MEMACCESS(0) 273 "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
292 "vld1.8 {d0, d1, d2, d3}, [%0]! \n" 274 "subs %2, %2, #12 \n"
293 "subs %2, %2, #12 \n" 275 "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
294 "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" 276 MEMACCESS(1)
295 "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" 277 "st1 {v2.8b}, [%1], #8 \n"
296 MEMACCESS(1) 278 MEMACCESS(1)
297 "vst1.8 {d4}, [%1]! \n" 279 "st1 {v2.s}[2], [%1], #4 \n"
298 MEMACCESS(1) 280 "b.gt 1b \n"
299 "vst1.32 {d5[0]}, [%1]! \n"
300 "bgt 1b \n"
301 : "+r"(src_ptr), // %0 281 : "+r"(src_ptr), // %0
302 "+r"(dst_ptr), // %1 282 "+r"(dst_ptr), // %1
303 "+r"(dst_width) // %2 283 "+r"(dst_width) // %2
304 : "r"(&kShuf38) // %3 284 : "r"(&kShuf38) // %3
305 : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" 285 : "v0", "v1", "v2", "v3", "memory", "cc"
306 ); 286 );
307 } 287 }
308 288
309 #endif //HAS_SCALEROWDOWN38_NEON
310
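A scalar sketch of the 8-to-3 point sample above (illustrative only): the kShuf38 table picks bytes 0, 3 and 6 out of every group of 8 loaded pixels.

#include <stddef.h>
#include <stdint.h>

static void ScaleRowDown38_sketch(const uint8_t* src_ptr, ptrdiff_t /*src_stride*/,
                                  uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[3];
    dst_ptr[x + 2] = src_ptr[6];
    src_ptr += 8;
  }
}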
311 #ifdef HAS_SCALEROWDOWN38_NEON
312 // 32x3 -> 12x1 289 // 32x3 -> 12x1
313 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, 290 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
314 ptrdiff_t src_stride, 291 ptrdiff_t src_stride,
315 uint8* dst_ptr, int dst_width) { 292 uint8* dst_ptr, int dst_width) {
316 const uint8* src_ptr1 = src_ptr + src_stride * 2; 293 const uint8* src_ptr1 = src_ptr + src_stride * 2;
294 ptrdiff_t tmp_src_stride = src_stride;
317 295
318 asm volatile ( 296 asm volatile (
319 MEMACCESS(5) 297 MEMACCESS(5)
320 "vld1.16 {q13}, [%5] \n" 298 "ld1 {v29.8h}, [%5] \n"
321 MEMACCESS(6) 299 MEMACCESS(6)
322 "vld1.8 {q14}, [%6] \n" 300 "ld1 {v30.16b}, [%6] \n"
323 MEMACCESS(7) 301 MEMACCESS(7)
324 "vld1.8 {q15}, [%7] \n" 302 "ld1 {v31.8h}, [%7] \n"
325 "add %3, %0 \n" 303 "add %2, %2, %0 \n"
326 ".p2align 2 \n" 304 "1: \n"
327 "1: \n" 305
328 306 // 00 40 01 41 02 42 03 43
329 // d0 = 00 40 01 41 02 42 03 43 307 // 10 50 11 51 12 52 13 53
330 // d1 = 10 50 11 51 12 52 13 53 308 // 20 60 21 61 22 62 23 63
331 // d2 = 20 60 21 61 22 62 23 63 309 // 30 70 31 71 32 72 33 73
332 // d3 = 30 70 31 71 32 72 33 73 310 MEMACCESS(0)
333 MEMACCESS(0) 311 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
334 "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
335 MEMACCESS(3) 312 MEMACCESS(3)
336 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" 313 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
337 MEMACCESS(4) 314 MEMACCESS(4)
338 "vld4.8 {d16, d17, d18, d19}, [%4]! \n" 315 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
339 "subs %2, %2, #12 \n" 316 "subs %4, %4, #12 \n"
340 317
341 // Shuffle the input data around to align the data 318 // Shuffle the input data around to align the data
342 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 319 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
343 // d0 = 00 10 01 11 02 12 03 13 320 // 00 10 01 11 02 12 03 13
344 // d1 = 40 50 41 51 42 52 43 53 321 // 40 50 41 51 42 52 43 53
345 "vtrn.u8 d0, d1 \n" 322 "trn1 v20.8b, v0.8b, v1.8b \n"
346 "vtrn.u8 d4, d5 \n" 323 "trn2 v21.8b, v0.8b, v1.8b \n"
347 "vtrn.u8 d16, d17 \n" 324 "trn1 v22.8b, v4.8b, v5.8b \n"
348 325 "trn2 v23.8b, v4.8b, v5.8b \n"
349 // d2 = 20 30 21 31 22 32 23 33 326 "trn1 v24.8b, v16.8b, v17.8b \n"
350 // d3 = 60 70 61 71 62 72 63 73 327 "trn2 v25.8b, v16.8b, v17.8b \n"
351 "vtrn.u8 d2, d3 \n" 328
352 "vtrn.u8 d6, d7 \n" 329 // 20 30 21 31 22 32 23 33
353 "vtrn.u8 d18, d19 \n" 330 // 60 70 61 71 62 72 63 73
354 331 "trn1 v0.8b, v2.8b, v3.8b \n"
355 // d0 = 00+10 01+11 02+12 03+13 332 "trn2 v1.8b, v2.8b, v3.8b \n"
356 // d2 = 40+50 41+51 42+52 43+53 333 "trn1 v4.8b, v6.8b, v7.8b \n"
357 "vpaddl.u8 q0, q0 \n" 334 "trn2 v5.8b, v6.8b, v7.8b \n"
358 "vpaddl.u8 q2, q2 \n" 335 "trn1 v16.8b, v18.8b, v19.8b \n"
359 "vpaddl.u8 q8, q8 \n" 336 "trn2 v17.8b, v18.8b, v19.8b \n"
360 337
361 // d3 = 60+70 61+71 62+72 63+73 338 // 00+10 01+11 02+12 03+13
362 "vpaddl.u8 d3, d3 \n" 339 // 40+50 41+51 42+52 43+53
363 "vpaddl.u8 d7, d7 \n" 340 "uaddlp v20.4h, v20.8b \n"
364 "vpaddl.u8 d19, d19 \n" 341 "uaddlp v21.4h, v21.8b \n"
342 "uaddlp v22.4h, v22.8b \n"
343 "uaddlp v23.4h, v23.8b \n"
344 "uaddlp v24.4h, v24.8b \n"
345 "uaddlp v25.4h, v25.8b \n"
346
347 // 60+70 61+71 62+72 63+73
348 "uaddlp v1.4h, v1.8b \n"
349 "uaddlp v5.4h, v5.8b \n"
350 "uaddlp v17.4h, v17.8b \n"
365 351
366 // combine source lines 352 // combine source lines
367 "vadd.u16 q0, q2 \n" 353 "add v20.4h, v20.4h, v22.4h \n"
368 "vadd.u16 q0, q8 \n" 354 "add v21.4h, v21.4h, v23.4h \n"
369 "vadd.u16 d4, d3, d7 \n" 355 "add v20.4h, v20.4h, v24.4h \n"
370 "vadd.u16 d4, d19 \n" 356 "add v21.4h, v21.4h, v25.4h \n"
357 "add v2.4h, v1.4h, v5.4h \n"
358 "add v2.4h, v2.4h, v17.4h \n"
371 359
372 // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] 360 // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
373 // + s[6 + st * 1] + s[7 + st * 1] 361 // + s[6 + st * 1] + s[7 + st * 1]
374 // + s[6 + st * 2] + s[7 + st * 2]) / 6 362 // + s[6 + st * 2] + s[7 + st * 2]) / 6
375 "vqrdmulh.s16 q2, q2, q13 \n" 363 "sqrdmulh v2.8h, v2.8h, v29.8h \n"
376 "vmovn.u16 d4, q2 \n" 364 "xtn v2.8b, v2.8h \n"
377 365
378 // Shuffle 2,3 reg around so that 2 can be added to the 366 // Shuffle 2,3 reg around so that 2 can be added to the
379 // 0,1 reg and 3 can be added to the 4,5 reg. This 367 // 0,1 reg and 3 can be added to the 4,5 reg. This
380 // requires expanding from u8 to u16 as the 0,1 and 4,5 368 // requires expanding from u8 to u16 as the 0,1 and 4,5
381 // registers are already expanded. Then do transposes 369 // registers are already expanded. Then do transposes
382 // to get aligned. 370 // to get aligned.
383 // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 371 // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
384 "vmovl.u8 q1, d2 \n" 372 "ushll v16.8h, v16.8b, #0 \n"
385 "vmovl.u8 q3, d6 \n" 373 "uaddl v0.8h, v0.8b, v4.8b \n"
386 "vmovl.u8 q9, d18 \n"
387 374
388 // combine source lines 375 // combine source lines
389 "vadd.u16 q1, q3 \n" 376 "add v0.8h, v0.8h, v16.8h \n"
390 "vadd.u16 q1, q9 \n" 377
391 378 // xx 20 xx 21 xx 22 xx 23
392 // d4 = xx 20 xx 30 xx 22 xx 32 379 // xx 30 xx 31 xx 32 xx 33
393 // d5 = xx 21 xx 31 xx 23 xx 33 380 "trn1 v1.8h, v0.8h, v0.8h \n"
394 "vtrn.u32 d2, d3 \n" 381 "trn2 v4.8h, v0.8h, v0.8h \n"
395 382 "xtn v0.4h, v1.4s \n"
396 // d4 = xx 20 xx 21 xx 22 xx 23 383 "xtn v4.4h, v4.4s \n"
397 // d5 = xx 30 xx 31 xx 32 xx 33
398 "vtrn.u16 d2, d3 \n"
399 384
400 // 0+1+2, 3+4+5 385 // 0+1+2, 3+4+5
401 "vadd.u16 q0, q1 \n" 386 "add v20.8h, v20.8h, v0.8h \n"
387 "add v21.8h, v21.8h, v4.8h \n"
402 388
403 // Need to divide, but can't downshift as the value 389 // Need to divide, but can't downshift as the value
404 // isn't a power of 2. So multiply by 65536 / n 390 // isn't a power of 2. So multiply by 65536 / n
405 // and take the upper 16 bits. 391 // and take the upper 16 bits.
406 "vqrdmulh.s16 q0, q0, q15 \n" 392 "sqrdmulh v0.8h, v20.8h, v31.8h \n"
393 "sqrdmulh v1.8h, v21.8h, v31.8h \n"
407 394
408 // Align for table lookup, vtbl requires registers to 395 // Align for table lookup, vtbl requires registers to
409 // be adjacent 396 // be adjacent
410 "vmov.u8 d2, d4 \n" 397 "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
411 398
412 "vtbl.u8 d3, {d0, d1, d2}, d28 \n" 399 MEMACCESS(1)
413 "vtbl.u8 d4, {d0, d1, d2}, d29 \n" 400 "st1 {v3.8b}, [%1], #8 \n"
414 401 MEMACCESS(1)
415 MEMACCESS(1) 402 "st1 {v3.s}[2], [%1], #4 \n"
416 "vst1.8 {d3}, [%1]! \n" 403 "b.gt 1b \n"
417 MEMACCESS(1)
418 "vst1.32 {d4[0]}, [%1]! \n"
419 "bgt 1b \n"
420 : "+r"(src_ptr), // %0 404 : "+r"(src_ptr), // %0
421 "+r"(dst_ptr), // %1 405 "+r"(dst_ptr), // %1
422 "+r"(dst_width), // %2 406 "+r"(tmp_src_stride), // %2
423 "+r"(src_stride), // %3 407 "+r"(src_ptr1), // %3
424 "+r"(src_ptr1) // %4 408 "+r"(dst_width) // %4
425 : "r"(&kMult38_Div6), // %5 409 : "r"(&kMult38_Div6), // %5
426 "r"(&kShuf38_2), // %6 410 "r"(&kShuf38_2), // %6
427 "r"(&kMult38_Div9) // %7 411 "r"(&kMult38_Div9) // %7
428 : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc" 412 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
429 ); 413 "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
430 } 414 "v30", "v31", "memory", "cc"
431 #endif //HAS_SCALEROWDOWN38_NEON 415 );
432 416 }
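A scalar sketch of the 3-row box filter above (illustrative only): within every group of 8 source pixels the first two outputs average a 3x3 block (hence kMult38_Div9) and the third a 2x3 block (hence kMult38_Div6). The NEON path rounds via sqrdmulh while this sketch simply truncates.

#include <stddef.h>
#include <stdint.h>

static void ScaleRowDown38_3_Box_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                        uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    int sums[3] = {0, 0, 0};
    for (int row = 0; row < 3; ++row) {
      const uint8_t* s = src_ptr + row * src_stride;
      sums[0] += s[0] + s[1] + s[2];
      sums[1] += s[3] + s[4] + s[5];
      sums[2] += s[6] + s[7];
    }
    dst_ptr[x + 0] = (uint8_t)(sums[0] / 9);
    dst_ptr[x + 1] = (uint8_t)(sums[1] / 9);
    dst_ptr[x + 2] = (uint8_t)(sums[2] / 6);
    src_ptr += 8;
  }
}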
433 #ifdef HAS_SCALEROWDOWN38_NEON 417
434 // 32x2 -> 12x1 418 // 32x2 -> 12x1
435 void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, 419 void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
436 ptrdiff_t src_stride, 420 ptrdiff_t src_stride,
437 uint8* dst_ptr, int dst_width) { 421 uint8* dst_ptr, int dst_width) {
422 // TODO(fbarchard): use src_stride directly for clang 3.5+.
423 ptrdiff_t tmp_src_stride = src_stride;
438 asm volatile ( 424 asm volatile (
439 MEMACCESS(4) 425 MEMACCESS(4)
440 "vld1.16 {q13}, [%4] \n" 426 "ld1 {v30.8h}, [%4] \n"
441 MEMACCESS(5) 427 MEMACCESS(5)
442 "vld1.8 {q14}, [%5] \n" 428 "ld1 {v31.16b}, [%5] \n"
443 "add %3, %0 \n" 429 "add %2, %2, %0 \n"
444 ".p2align 2 \n" 430 "1: \n"
445 "1: \n" 431
446 432 // 00 40 01 41 02 42 03 43
447 // d0 = 00 40 01 41 02 42 03 43 433 // 10 50 11 51 12 52 13 53
448 // d1 = 10 50 11 51 12 52 13 53 434 // 20 60 21 61 22 62 23 63
449 // d2 = 20 60 21 61 22 62 23 63 435 // 30 70 31 71 32 72 33 73
450 // d3 = 30 70 31 71 32 72 33 73 436 MEMACCESS(0)
451 MEMACCESS(0) 437 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
452 "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
453 MEMACCESS(3) 438 MEMACCESS(3)
454 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" 439 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
455 "subs %2, %2, #12 \n" 440 "subs %3, %3, #12 \n"
456 441
458 // Shuffle the input data around to align the data 443 // Shuffle the input data around to align the data
458 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 443 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
459 // d0 = 00 10 01 11 02 12 03 13 444 // 00 10 01 11 02 12 03 13
460 // d1 = 40 50 41 51 42 52 43 53 445 // 40 50 41 51 42 52 43 53
461 "vtrn.u8 d0, d1 \n" 446 "trn1 v16.8b, v0.8b, v1.8b \n"
462 "vtrn.u8 d4, d5 \n" 447 "trn2 v17.8b, v0.8b, v1.8b \n"
463 448 "trn1 v18.8b, v4.8b, v5.8b \n"
464 // d2 = 20 30 21 31 22 32 23 33 449 "trn2 v19.8b, v4.8b, v5.8b \n"
465 // d3 = 60 70 61 71 62 72 63 73 450
466 "vtrn.u8 d2, d3 \n" 451 // 20 30 21 31 22 32 23 33
467 "vtrn.u8 d6, d7 \n" 452 // 60 70 61 71 62 72 63 73
468 453 "trn1 v0.8b, v2.8b, v3.8b \n"
469 // d0 = 00+10 01+11 02+12 03+13 454 "trn2 v1.8b, v2.8b, v3.8b \n"
470 // d2 = 40+50 41+51 42+52 43+53 455 "trn1 v4.8b, v6.8b, v7.8b \n"
471 "vpaddl.u8 q0, q0 \n" 456 "trn2 v5.8b, v6.8b, v7.8b \n"
472 "vpaddl.u8 q2, q2 \n" 457
473 458 // 00+10 01+11 02+12 03+13
474 // d3 = 60+70 61+71 62+72 63+73 459 // 40+50 41+51 42+52 43+53
475 "vpaddl.u8 d3, d3 \n" 460 "uaddlp v16.4h, v16.8b \n"
476 "vpaddl.u8 d7, d7 \n" 461 "uaddlp v17.4h, v17.8b \n"
462 "uaddlp v18.4h, v18.8b \n"
463 "uaddlp v19.4h, v19.8b \n"
464
465 // 60+70 61+71 62+72 63+73
466 "uaddlp v1.4h, v1.8b \n"
467 "uaddlp v5.4h, v5.8b \n"
477 468
478 // combine source lines 469 // combine source lines
479 "vadd.u16 q0, q2 \n" 470 "add v16.4h, v16.4h, v18.4h \n"
480 "vadd.u16 d4, d3, d7 \n" 471 "add v17.4h, v17.4h, v19.4h \n"
472 "add v2.4h, v1.4h, v5.4h \n"
481 473
482 // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 474 // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
483 "vqrshrn.u16 d4, q2, #2 \n" 475 "uqrshrn v2.8b, v2.8h, #2 \n"
484 476
485 // Shuffle 2,3 reg around so that 2 can be added to the 477 // Shuffle 2,3 reg around so that 2 can be added to the
486 // 0,1 reg and 3 can be added to the 4,5 reg. This 478 // 0,1 reg and 3 can be added to the 4,5 reg. This
487 // requires expanding from u8 to u16 as the 0,1 and 4,5 479 // requires expanding from u8 to u16 as the 0,1 and 4,5
488 // registers are already expanded. Then do transposes 480 // registers are already expanded. Then do transposes
489 // to get aligned. 481 // to get aligned.
490 // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 482 // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
491 "vmovl.u8 q1, d2 \n"
492 "vmovl.u8 q3, d6 \n"
493 483
494 // combine source lines 484 // combine source lines
495 "vadd.u16 q1, q3 \n" 485 "uaddl v0.8h, v0.8b, v4.8b \n"
496 486
497 // d4 = xx 20 xx 30 xx 22 xx 32 487 // xx 20 xx 21 xx 22 xx 23
498 // d5 = xx 21 xx 31 xx 23 xx 33 488 // xx 30 xx 31 xx 32 xx 33
499 "vtrn.u32 d2, d3 \n" 489 "trn1 v1.8h, v0.8h, v0.8h \n"
500 490 "trn2 v4.8h, v0.8h, v0.8h \n"
501 // d4 = xx 20 xx 21 xx 22 xx 23 491 "xtn v0.4h, v1.4s \n"
502 // d5 = xx 30 xx 31 xx 32 xx 33 492 "xtn v4.4h, v4.4s \n"
503 "vtrn.u16 d2, d3 \n"
504 493
505 // 0+1+2, 3+4+5 494 // 0+1+2, 3+4+5
506 "vadd.u16 q0, q1 \n" 495 "add v16.8h, v16.8h, v0.8h \n"
496 "add v17.8h, v17.8h, v4.8h \n"
507 497
508 // Need to divide, but can't downshift as the value 498 // Need to divide, but can't downshift as the value
509 // isn't a power of 2. So multiply by 65536 / n 499 // isn't a power of 2. So multiply by 65536 / n
510 // and take the upper 16 bits. 500 // and take the upper 16 bits.
511 "vqrdmulh.s16 q0, q0, q13 \n" 501 "sqrdmulh v0.8h, v16.8h, v30.8h \n"
502 "sqrdmulh v1.8h, v17.8h, v30.8h \n"
512 503
513 // Align for table lookup, vtbl requires registers to 504 // Align for table lookup, vtbl requires registers to
514 // be adjacent 505 // be adjacent
515 "vmov.u8 d2, d4 \n" 506
516 507 "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
517 "vtbl.u8 d3, {d0, d1, d2}, d28 \n" 508
518 "vtbl.u8 d4, {d0, d1, d2}, d29 \n" 509 MEMACCESS(1)
519 510 "st1 {v3.8b}, [%1], #8 \n"
520 MEMACCESS(1) 511 MEMACCESS(1)
521 "vst1.8 {d3}, [%1]! \n" 512 "st1 {v3.s}[2], [%1], #4 \n"
522 MEMACCESS(1) 513 "b.gt 1b \n"
523 "vst1.32 {d4[0]}, [%1]! \n" 514 : "+r"(src_ptr), // %0
524 "bgt 1b \n" 515 "+r"(dst_ptr), // %1
525 : "+r"(src_ptr), // %0 516 "+r"(tmp_src_stride), // %2
526 "+r"(dst_ptr), // %1 517 "+r"(dst_width) // %3
527 "+r"(dst_width), // %2 518 : "r"(&kMult38_Div6), // %4
528 "+r"(src_stride) // %3 519 "r"(&kShuf38_2) // %5
529 : "r"(&kMult38_Div6), // %4 520 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
530 "r"(&kShuf38_2) // %5 521 "v18", "v19", "v30", "v31", "memory", "cc"
531 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" 522 );
532 ); 523 }
533 } 524
534 #endif //HAS_SCALEROWDOWN38_NEON
535
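The 2-row variant above follows the same pattern; a scalar sketch (illustrative only):

#include <stddef.h>
#include <stdint.h>

// First two outputs average a 3x2 block (divide by 6), the third a 2x2
// block (divide by 4, rounded as the uqrshrn #2 above does).
static void ScaleRowDown38_2_Box_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                        uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s0 = src_ptr;
  const uint8_t* s1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = (uint8_t)((s0[0] + s0[1] + s0[2] + s1[0] + s1[1] + s1[2]) / 6);
    dst_ptr[x + 1] = (uint8_t)((s0[3] + s0[4] + s0[5] + s1[3] + s1[4] + s1[5]) / 6);
    dst_ptr[x + 2] = (uint8_t)((s0[6] + s0[7] + s1[6] + s1[7] + 2) >> 2);
    s0 += 8;
    s1 += 8;
  }
}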
536 #if 0
537 // 16x2 -> 16x1 525 // 16x2 -> 16x1
538 void ScaleFilterRows_NEON(uint8* dst_ptr, 526 void ScaleFilterRows_NEON(uint8* dst_ptr,
539 const uint8* src_ptr, ptrdiff_t src_stride, 527 const uint8* src_ptr, ptrdiff_t src_stride,
540 int dst_width, int source_y_fraction) { 528 int dst_width, int source_y_fraction) {
529 int y_fraction = 256 - source_y_fraction;
541 asm volatile ( 530 asm volatile (
542 "cmp %4, #0 \n" 531 "cmp %4, #0 \n"
543 "beq 100f \n" 532 "b.eq 100f \n"
544 "add %2, %1 \n" 533 "add %2, %2, %1 \n"
545 "cmp %4, #64 \n" 534 "cmp %4, #64 \n"
546 "beq 75f \n" 535 "b.eq 75f \n"
547 "cmp %4, #128 \n" 536 "cmp %4, #128 \n"
548 "beq 50f \n" 537 "b.eq 50f \n"
549 "cmp %4, #192 \n" 538 "cmp %4, #192 \n"
550 "beq 25f \n" 539 "b.eq 25f \n"
551 540
552 "vdup.8 d5, %4 \n" 541 "dup v5.8b, %w4 \n"
553 "rsb %4, #256 \n" 542 "dup v4.8b, %w5 \n"
554 "vdup.8 d4, %4 \n"
555 // General purpose row blend. 543 // General purpose row blend.
556 "1: \n" 544 "1: \n"
557 MEMACCESS(1) 545 MEMACCESS(1)
558 "vld1.8 {q0}, [%1]! \n" 546 "ld1 {v0.16b}, [%1], #16 \n"
559 MEMACCESS(2) 547 MEMACCESS(2)
560 "vld1.8 {q1}, [%2]! \n" 548 "ld1 {v1.16b}, [%2], #16 \n"
561 "subs %3, %3, #16 \n" 549 "subs %3, %3, #16 \n"
562 "vmull.u8 q13, d0, d4 \n" 550 "umull v6.8h, v0.8b, v4.8b \n"
563 "vmull.u8 q14, d1, d4 \n" 551 "umull2 v7.8h, v0.16b, v4.16b \n"
564 "vmlal.u8 q13, d2, d5 \n" 552 "umlal v6.8h, v1.8b, v5.8b \n"
565 "vmlal.u8 q14, d3, d5 \n" 553 "umlal2 v7.8h, v1.16b, v5.16b \n"
566 "vrshrn.u16 d0, q13, #8 \n" 554 "rshrn v0.8b, v6.8h, #8 \n"
567 "vrshrn.u16 d1, q14, #8 \n" 555 "rshrn2 v0.16b, v7.8h, #8 \n"
568 MEMACCESS(0) 556 MEMACCESS(0)
569 "vst1.8 {q0}, [%0]! \n" 557 "st1 {v0.16b}, [%0], #16 \n"
570 "bgt 1b \n" 558 "b.gt 1b \n"
571 "b 99f \n" 559 "b 99f \n"
572 560
573 // Blend 25 / 75. 561 // Blend 25 / 75.
574 "25: \n" 562 "25: \n"
575 MEMACCESS(1) 563 MEMACCESS(1)
576 "vld1.8 {q0}, [%1]! \n" 564 "ld1 {v0.16b}, [%1], #16 \n"
577 MEMACCESS(2) 565 MEMACCESS(2)
578 "vld1.8 {q1}, [%2]! \n" 566 "ld1 {v1.16b}, [%2], #16 \n"
579 "subs %3, %3, #16 \n" 567 "subs %3, %3, #16 \n"
580 "vrhadd.u8 q0, q1 \n" 568 "urhadd v0.16b, v0.16b, v1.16b \n"
581 "vrhadd.u8 q0, q1 \n" 569 "urhadd v0.16b, v0.16b, v1.16b \n"
582 MEMACCESS(0) 570 MEMACCESS(0)
583 "vst1.8 {q0}, [%0]! \n" 571 "st1 {v0.16b}, [%0], #16 \n"
584 "bgt 25b \n" 572 "b.gt 25b \n"
585 "b 99f \n" 573 "b 99f \n"
586 574
587 // Blend 50 / 50. 575 // Blend 50 / 50.
588 "50: \n" 576 "50: \n"
589 MEMACCESS(1) 577 MEMACCESS(1)
590 "vld1.8 {q0}, [%1]! \n" 578 "ld1 {v0.16b}, [%1], #16 \n"
591 MEMACCESS(2) 579 MEMACCESS(2)
592 "vld1.8 {q1}, [%2]! \n" 580 "ld1 {v1.16b}, [%2], #16 \n"
593 "subs %3, %3, #16 \n" 581 "subs %3, %3, #16 \n"
594 "vrhadd.u8 q0, q1 \n" 582 "urhadd v0.16b, v0.16b, v1.16b \n"
595 MEMACCESS(0) 583 MEMACCESS(0)
596 "vst1.8 {q0}, [%0]! \n" 584 "st1 {v0.16b}, [%0], #16 \n"
597 "bgt 50b \n" 585 "b.gt 50b \n"
598 "b 99f \n" 586 "b 99f \n"
599 587
600 // Blend 75 / 25. 588 // Blend 75 / 25.
601 "75: \n" 589 "75: \n"
602 MEMACCESS(1) 590 MEMACCESS(1)
603 "vld1.8 {q1}, [%1]! \n" 591 "ld1 {v1.16b}, [%1], #16 \n"
604 MEMACCESS(2) 592 MEMACCESS(2)
605 "vld1.8 {q0}, [%2]! \n" 593 "ld1 {v0.16b}, [%2], #16 \n"
606 "subs %3, %3, #16 \n" 594 "subs %3, %3, #16 \n"
607 "vrhadd.u8 q0, q1 \n" 595 "urhadd v0.16b, v0.16b, v1.16b \n"
608 "vrhadd.u8 q0, q1 \n" 596 "urhadd v0.16b, v0.16b, v1.16b \n"
609 MEMACCESS(0) 597 MEMACCESS(0)
610 "vst1.8 {q0}, [%0]! \n" 598 "st1 {v0.16b}, [%0], #16 \n"
611 "bgt 75b \n" 599 "b.gt 75b \n"
612 "b 99f \n" 600 "b 99f \n"
613 601
614 // Blend 100 / 0 - Copy row unchanged. 602 // Blend 100 / 0 - Copy row unchanged.
615 "100: \n" 603 "100: \n"
616 MEMACCESS(1) 604 MEMACCESS(1)
617 "vld1.8 {q0}, [%1]! \n" 605 "ld1 {v0.16b}, [%1], #16 \n"
618 "subs %3, %3, #16 \n" 606 "subs %3, %3, #16 \n"
619 MEMACCESS(0) 607 MEMACCESS(0)
620 "vst1.8 {q0}, [%0]! \n" 608 "st1 {v0.16b}, [%0], #16 \n"
621 "bgt 100b \n" 609 "b.gt 100b \n"
622 610
623 "99: \n" 611 "99: \n"
624 MEMACCESS(0) 612 MEMACCESS(0)
625 "vst1.8 {d1[7]}, [%0] \n" 613 "st1 {v0.b}[15], [%0] \n"
626 : "+r"(dst_ptr), // %0 614 : "+r"(dst_ptr), // %0
627 "+r"(src_ptr), // %1 615 "+r"(src_ptr), // %1
628 "+r"(src_stride), // %2 616 "+r"(src_stride), // %2
629 "+r"(dst_width), // %3 617 "+r"(dst_width), // %3
630 "+r"(source_y_fraction) // %4 618 "+r"(source_y_fraction),// %4
619 "+r"(y_fraction) // %5
631 : 620 :
632 : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" 621 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
633 ); 622 );
634 } 623 }
635 #endif //0 624
636
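A scalar sketch of the general row-blend path above (illustrative only): with an 8.8 fixed-point fraction f the output is (row0 * (256 - f) + row1 * f + 128) >> 8, and the 25/50/75/100 special cases are cheaper (and slightly differently rounded) forms of the same blend.

#include <stddef.h>
#include <stdint.h>

static void ScaleFilterRows_sketch(uint8_t* dst_ptr,
                                   const uint8_t* src_ptr, ptrdiff_t src_stride,
                                   int dst_width, int source_y_fraction) {
  const int f1 = source_y_fraction;        // weight of row 1
  const int f0 = 256 - source_y_fraction;  // weight of row 0
  const uint8_t* s0 = src_ptr;
  const uint8_t* s1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((s0[x] * f0 + s1[x] * f1 + 128) >> 8);
  }
}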
637 #ifdef HAS_SCALEARGBROWDOWN2_NEON
638 void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 625 void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
639 uint8* dst, int dst_width) { 626 uint8* dst, int dst_width) {
640 asm volatile ( 627 asm volatile (
641 ".p2align 2 \n"
642 "1: \n" 628 "1: \n"
643 // load even pixels into q0, odd into q1 629 // load even pixels into q0, odd into q1
644 MEMACCESS(0) 630 MEMACCESS (0)
645 "vld2.32 {q0, q1}, [%0]! \n" 631 "ld2 {v0.4s, v1.4s}, [%0], #32 \n"
646 MEMACCESS(0) 632 MEMACCESS (0)
647 "vld2.32 {q2, q3}, [%0]! \n" 633 "ld2 {v2.4s, v3.4s}, [%0], #32 \n"
648 "subs %2, %2, #8 \n" // 8 processed per loop 634 "subs %2, %2, #8 \n" // 8 processed per loop
649 MEMACCESS(1) 635 MEMACCESS (1)
650 "vst1.8 {q1}, [%1]! \n" // store odd pixels 636 "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
651 MEMACCESS(1) 637 MEMACCESS (1)
652 "vst1.8 {q3}, [%1]! \n" 638 "st1 {v3.16b}, [%1], #16 \n"
653 "bgt 1b \n" 639 "b.gt 1b \n"
654 : "+r"(src_ptr), // %0 640 : "+r" (src_ptr), // %0
655 "+r"(dst), // %1 641 "+r" (dst), // %1
656 "+r"(dst_width) // %2 642 "+r" (dst_width) // %2
657 : 643 :
658 : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List 644 : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
659 ); 645 );
660 } 646 }
661 #endif //HAS_SCALEARGBROWDOWN2_NEON 647
662
663 #ifdef HAS_SCALEARGBROWDOWN2_NEON
664 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 648 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
665 uint8* dst, int dst_width) { 649 uint8* dst, int dst_width) {
666 asm volatile ( 650 asm volatile (
667 // change the stride to row 2 pointer 651 // change the stride to row 2 pointer
668 "add %1, %1, %0 \n" 652 "add %1, %1, %0 \n"
669 ".p2align 2 \n"
670 "1: \n" 653 "1: \n"
671 MEMACCESS(0) 654 MEMACCESS (0)
672 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 655 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
673 MEMACCESS(0)
674 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
675 "subs %3, %3, #8 \n" // 8 processed per loop. 656 "subs %3, %3, #8 \n" // 8 processed per loop.
676 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. 657 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
677 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 658 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
678 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. 659 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
679 "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. 660 "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
680 MEMACCESS(1) 661 MEMACCESS (1)
681 "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. 662 "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
682 MEMACCESS(1) 663 "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
683 "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. 664 "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
684 "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. 665 "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
685 "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. 666 "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
686 "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. 667 "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
687 "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. 668 "rshrn v1.8b, v1.8h, #2 \n"
688 "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack 669 "rshrn v2.8b, v2.8h, #2 \n"
689 "vrshrn.u16 d1, q1, #2 \n" 670 "rshrn v3.8b, v3.8h, #2 \n"
690 "vrshrn.u16 d2, q2, #2 \n" 671 MEMACCESS (2)
691 "vrshrn.u16 d3, q3, #2 \n" 672 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
692 MEMACCESS(2) 673 "b.gt 1b \n"
693 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" 674 : "+r" (src_ptr), // %0
694 "bgt 1b \n" 675 "+r" (src_stride), // %1
695 : "+r"(src_ptr), // %0 676 "+r" (dst), // %2
696 "+r"(src_stride), // %1 677 "+r" (dst_width) // %3
697 "+r"(dst), // %2
698 "+r"(dst_width) // %3
699 : 678 :
700 : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" 679 : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
701 ); 680 );
702 } 681 }
703 #endif //HAS_SCALEARGBROWDOWN2_NEON 682
704
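A scalar sketch of the ARGB 2x2 box filter above (illustrative only): each output pixel is the rounded per-channel average of a 2x2 block of 4-byte ARGB pixels.

#include <stddef.h>
#include <stdint.h>

static void ScaleARGBRowDown2Box_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                        uint8_t* dst, int dst_width) {
  const uint8_t* s0 = src_ptr;
  const uint8_t* s1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    for (int c = 0; c < 4; ++c) {  // B, G, R, A
      int sum = s0[c] + s0[4 + c] + s1[c] + s1[4 + c];
      dst[4 * x + c] = (uint8_t)((sum + 2) >> 2);
    }
    s0 += 8;
    s1 += 8;
  }
}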
705 #ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
706 // Reads 4 pixels at a time. 683 // Reads 4 pixels at a time.
707 // Alignment requirement: src_argb 4 byte aligned. 684 // Alignment requirement: src_argb 4 byte aligned.
708 void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, 685 void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
709 int src_stepx, uint8* dst_argb, int dst_width) { 686 int src_stepx, uint8* dst_argb, int dst_width) {
710 asm volatile ( 687 asm volatile (
711 "mov r12, %3, lsl #2 \n"
712 ".p2align 2 \n"
713 "1: \n" 688 "1: \n"
714 MEMACCESS(0) 689 MEMACCESS(0)
715 "vld1.32 {d0[0]}, [%0], r12 \n" 690 "ld1 {v0.s}[0], [%0], %3 \n"
716 MEMACCESS(0) 691 MEMACCESS(0)
717 "vld1.32 {d0[1]}, [%0], r12 \n" 692 "ld1 {v0.s}[1], [%0], %3 \n"
718 MEMACCESS(0) 693 MEMACCESS(0)
719 "vld1.32 {d1[0]}, [%0], r12 \n" 694 "ld1 {v0.s}[2], [%0], %3 \n"
720 MEMACCESS(0) 695 MEMACCESS(0)
721 "vld1.32 {d1[1]}, [%0], r12 \n" 696 "ld1 {v0.s}[3], [%0], %3 \n"
722 "subs %2, %2, #4 \n" // 4 pixels per loop. 697 "subs %2, %2, #4 \n" // 4 pixels per loop.
723 MEMACCESS(1) 698 MEMACCESS(1)
724 "vst1.8 {q0}, [%1]! \n" 699 "st1 {v0.16b}, [%1], #16 \n"
725 "bgt 1b \n" 700 "b.gt 1b \n"
726 : "+r"(src_argb), // %0 701 : "+r"(src_argb), // %0
727 "+r"(dst_argb), // %1 702 "+r"(dst_argb), // %1
728 "+r"(dst_width) // %2 703 "+r"(dst_width) // %2
729 : "r"(src_stepx) // %3 704 : "r"(static_cast<ptrdiff_t>(src_stepx * 4)) // %3
730 : "memory", "cc", "r12", "q0" 705 : "memory", "cc", "v0"
731 ); 706 );
732 } 707 }
733 #endif //HAS_SCALEARGBROWDOWNEVEN_NEON 708
734
735 #ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
736 // Reads 4 pixels at a time. 709 // Reads 4 pixels at a time.
737 // Alignment requirement: src_argb 4 byte aligned. 710 // Alignment requirement: src_argb 4 byte aligned.
711 // TODO, might be worth another optimization pass in future.
712 // It could be upgraded to 8 pixels at a time to start with.
738 void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, 713 void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
739 int src_stepx, 714 int src_stepx,
740 uint8* dst_argb, int dst_width) { 715 uint8* dst_argb, int dst_width) {
741 asm volatile ( 716 asm volatile (
742 "mov r12, %4, lsl #2 \n"
743 "add %1, %1, %0 \n" 717 "add %1, %1, %0 \n"
744 ".p2align 2 \n"
745 "1: \n" 718 "1: \n"
746 MEMACCESS(0) 719 MEMACCESS(0)
747 "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 720 "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
748 MEMACCESS(1) 721 MEMACCESS(1)
749 "vld1.8 {d1}, [%1], r12 \n" 722 "ld1 {v1.8b}, [%1], %4 \n"
750 MEMACCESS(0) 723 MEMACCESS(0)
751 "vld1.8 {d2}, [%0], r12 \n" 724 "ld1 {v2.8b}, [%0], %4 \n"
752 MEMACCESS(1) 725 MEMACCESS(1)
753 "vld1.8 {d3}, [%1], r12 \n" 726 "ld1 {v3.8b}, [%1], %4 \n"
754 MEMACCESS(0) 727 MEMACCESS(0)
755 "vld1.8 {d4}, [%0], r12 \n" 728 "ld1 {v4.8b}, [%0], %4 \n"
756 MEMACCESS(1) 729 MEMACCESS(1)
757 "vld1.8 {d5}, [%1], r12 \n" 730 "ld1 {v5.8b}, [%1], %4 \n"
758 MEMACCESS(0) 731 MEMACCESS(0)
759 "vld1.8 {d6}, [%0], r12 \n" 732 "ld1 {v6.8b}, [%0], %4 \n"
760 MEMACCESS(1) 733 MEMACCESS(1)
761 "vld1.8 {d7}, [%1], r12 \n" 734 "ld1 {v7.8b}, [%1], %4 \n"
762 "vaddl.u8 q0, d0, d1 \n" 735 "uaddl v0.8h, v0.8b, v1.8b \n"
763 "vaddl.u8 q1, d2, d3 \n" 736 "uaddl v2.8h, v2.8b, v3.8b \n"
764 "vaddl.u8 q2, d4, d5 \n" 737 "uaddl v4.8h, v4.8b, v5.8b \n"
765 "vaddl.u8 q3, d6, d7 \n" 738 "uaddl v6.8h, v6.8b, v7.8b \n"
766 "vswp.8 d1, d2 \n" // ab_cd -> ac_bd 739 "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
767 "vswp.8 d5, d6 \n" // ef_gh -> eg_fh 740 "mov v0.d[1], v2.d[0] \n"
768 "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) 741 "mov v2.d[0], v16.d[1] \n"
769 "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) 742 "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
770 "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. 743 "mov v4.d[1], v6.d[0] \n"
771 "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. 744 "mov v6.d[0], v16.d[1] \n"
745 "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
746 "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
747 "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
748 "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
772 "subs %3, %3, #4 \n" // 4 pixels per loop. 749 "subs %3, %3, #4 \n" // 4 pixels per loop.
773 MEMACCESS(2) 750 MEMACCESS(2)
774 "vst1.8 {q0}, [%2]! \n" 751 "st1 {v0.16b}, [%2], #16 \n"
775 "bgt 1b \n" 752 "b.gt 1b \n"
776 : "+r"(src_argb), // %0 753 : "+r"(src_argb), // %0
777 "+r"(src_stride), // %1 754 "+r"(src_stride), // %1
778 "+r"(dst_argb), // %2 755 "+r"(dst_argb), // %2
779 "+r"(dst_width) // %3 756 "+r"(dst_width) // %3
780 : "r"(src_stepx) // %4 757 : "r"(src_stepx * 4) // %4
781 : "memory", "cc", "r12", "q0", "q1", "q2", "q3" 758 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
782 ); 759 );
783 } 760 }
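A scalar sketch of ScaleARGBRowDownEvenBox above (illustrative only): for each output pixel the routine averages a 2x2 block of ARGB pixels with rounding, then steps src_stepx source pixels to the right.

#include <stddef.h>
#include <stdint.h>

static void ScaleARGBRowDownEvenBox_sketch(const uint8_t* src_argb, ptrdiff_t src_stride,
                                           int src_stepx,
                                           uint8_t* dst_argb, int dst_width) {
  const uint8_t* s0 = src_argb;
  const uint8_t* s1 = src_argb + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    for (int c = 0; c < 4; ++c) {  // B, G, R, A
      int sum = s0[c] + s0[4 + c] + s1[c] + s1[4 + c];
      dst_argb[4 * x + c] = (uint8_t)((sum + 2) >> 2);
    }
    s0 += src_stepx * 4;
    s1 += src_stepx * 4;
  }
}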
784 #endif // HAS_SCALEARGBROWDOWNEVEN_NEON 761 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
785 #endif // __aarch64__
786 762
787 #ifdef __cplusplus 763 #ifdef __cplusplus
788 } // extern "C" 764 } // extern "C"
789 } // namespace libyuv 765 } // namespace libyuv
790 #endif 766 #endif