/*
 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef HAS_SCALEROWDOWN2_NEON
// Read 32x1 pixels, throw away the even pixels, and write 16x1 pixels.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "vld2.8 {q0, q1}, [%0]! \n"
    "subs %2, %2, #16 \n"  // 16 processed per loop
    MEMACCESS(1)
    "vst1.8 {q1}, [%1]! \n"  // store odd pixels
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst),       // %1
    "+r"(dst_width)  // %2
  :
  : "q0", "q1"       // Clobber List
  );
}
#endif  // HAS_SCALEROWDOWN2_NEON
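
// For reference, a scalar C sketch of the point-sampling kernel above
// (an illustrative, hypothetical helper, not part of the libyuv API).
// Like the NEON path, it keeps the odd source pixels and drops the even
// ones.
static void ScaleRowDown2_SketchC(const uint8* src_ptr, uint8* dst,
                                  int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[2 * x + 1];  // keep odd pixels, drop even pixels
  }
}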

#ifdef HAS_SCALEROWDOWN2_NEON
// Read 32x2 pixels, average down, and write 16x1 pixels.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add %1, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0, q1}, [%0]! \n"  // load row 1 and post inc
    MEMACCESS(1)
    "vld1.8 {q2, q3}, [%1]! \n"  // load row 2 and post inc
    "subs %3, %3, #16 \n"  // 16 processed per loop
    "vpaddl.u8 q0, q0 \n"  // row 1 add adjacent
    "vpaddl.u8 q1, q1 \n"
    "vpadal.u8 q0, q2 \n"  // row 2 add adjacent + row1
    "vpadal.u8 q1, q3 \n"
    "vrshrn.u16 d0, q0, #2 \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2 \n"
    MEMACCESS(2)
    "vst1.8 {q0}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),     // %0
    "+r"(src_stride),  // %1
    "+r"(dst),         // %2
    "+r"(dst_width)    // %3
  :
  : "q0", "q1", "q2", "q3"  // Clobber List
  );
}
#endif  // HAS_SCALEROWDOWN2_NEON
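
// For reference, a scalar C sketch of the 2x2 box filter above
// (illustrative only; the helper name is hypothetical). vrshrn.u16 #2 is
// a rounding narrowing shift, hence the "+ 2" before the ">> 2".
static void ScaleRowDown2Box_SketchC(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst, int dst_width) {
  const uint8* s = src_ptr;               // row 0
  const uint8* t = src_ptr + src_stride;  // row 1
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8)((s[2 * x] + s[2 * x + 1] +
                      t[2 * x] + t[2 * x + 1] + 2) >> 2);  // rounded average
  }
}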

#ifdef HAS_SCALEROWDOWN4_NEON
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    "subs %2, %2, #8 \n"  // 8 processed per loop
    MEMACCESS(1)
    "vst1.8 {d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "q0", "q1", "memory", "cc"
  );
}
#endif  // HAS_SCALEROWDOWN4_NEON
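
// For reference (illustrative only; hypothetical helper): the vld4.8 above
// deinterleaves every 4th pixel into d0..d3, and storing d2 point samples
// the third pixel of each group of 4.
static void ScaleRowDown4_SketchC(const uint8* src_ptr, uint8* dst,
                                  int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[4 * x + 2];
  }
}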

#ifdef HAS_SCALEROWDOWN4_NEON
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  const uint8* src_ptr2 = src_ptr + src_stride * 2;
  const uint8* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n"  // load up 16x4
    MEMACCESS(3)
    "vld1.8 {q1}, [%3]! \n"
    MEMACCESS(4)
    "vld1.8 {q2}, [%4]! \n"
    MEMACCESS(5)
    "vld1.8 {q3}, [%5]! \n"
    "subs %2, %2, #4 \n"
    "vpaddl.u8 q0, q0 \n"
    "vpadal.u8 q0, q1 \n"
    "vpadal.u8 q0, q2 \n"
    "vpadal.u8 q0, q3 \n"
    "vpaddl.u16 q0, q0 \n"
    "vrshrn.u32 d0, q0, #4 \n"  // divide by 16 w/rounding
    "vmovn.u16 d0, q0 \n"
    MEMACCESS(1)
    "vst1.32 {d0[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(src_ptr1),   // %3
    "+r"(src_ptr2),   // %4
    "+r"(src_ptr3)    // %5
  :
  : "q0", "q1", "q2", "q3", "memory", "cc"
  );
}
#endif  // HAS_SCALEROWDOWN4_NEON
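
// For reference, a scalar C sketch of the 4x4 box filter above
// (illustrative only; hypothetical helper). The 16 taps are summed into a
// wide accumulator; vrshrn.u32 #4 then rounds and divides by 16.
static void ScaleRowDown4Box_SketchC(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst, int dst_width) {
  int x, i, j;
  for (x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (j = 0; j < 4; ++j) {    // 4 rows
      for (i = 0; i < 4; ++i) {  // 4 columns
        sum += src_ptr[j * src_stride + 4 * x + i];
      }
    }
    dst[x] = (uint8)((sum + 8) >> 4);  // round, then divide by 16
  }
}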

#ifdef HAS_SCALEROWDOWN34_NEON
// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
// to load every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    "subs %2, %2, #24 \n"
    "vmov d2, d3 \n"  // order d0, d1, d2
    MEMACCESS(1)
    "vst3.8 {d0, d1, d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "d0", "d1", "d2", "d3", "memory", "cc"
  );
}
#endif  // HAS_SCALEROWDOWN34_NEON
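
// For reference (illustrative only; hypothetical helper): after d2 is
// overwritten with d3, the vst3.8 above keeps pixels 0, 1 and 3 of every
// group of 4.
static void ScaleRowDown34_SketchC(const uint8* src_ptr, uint8* dst,
                                   int dst_width) {
  int x;
  for (x = 0; x < dst_width / 3; ++x) {
    dst[3 * x + 0] = src_ptr[4 * x + 0];
    dst[3 * x + 1] = src_ptr[4 * x + 1];
    dst[3 * x + 2] = src_ptr[4 * x + 3];
  }
}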

#ifdef HAS_SCALEROWDOWN34_NEON
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8 d24, #3 \n"
    "add %3, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"  // src line 1
    "subs %2, %2, #24 \n"

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "vmovl.u8 q8, d4 \n"
    "vmovl.u8 q9, d5 \n"
    "vmovl.u8 q10, d6 \n"
    "vmovl.u8 q11, d7 \n"

    // 3 * line_0 + line_1
    "vmlal.u8 q8, d0, d24 \n"
    "vmlal.u8 q9, d1, d24 \n"
    "vmlal.u8 q10, d2, d24 \n"
    "vmlal.u8 q11, d3, d24 \n"

    // (3 * line_0 + line_1) >> 2
    "vqrshrn.u16 d0, q8, #2 \n"
    "vqrshrn.u16 d1, q9, #2 \n"
    "vqrshrn.u16 d2, q10, #2 \n"
    "vqrshrn.u16 d3, q11, #2 \n"

    // a0 = (src[0] * 3 + src[1] * 1) >> 2
    "vmovl.u8 q8, d1 \n"
    "vmlal.u8 q8, d0, d24 \n"
    "vqrshrn.u16 d0, q8, #2 \n"

    // a1 = (src[1] * 1 + src[2] * 1) >> 1
    "vrhadd.u8 d1, d1, d2 \n"

    // a2 = (src[2] * 1 + src[3] * 3) >> 2
    "vmovl.u8 q8, d2 \n"
    "vmlal.u8 q8, d3, d24 \n"
    "vqrshrn.u16 d2, q8, #2 \n"

    MEMACCESS(1)
    "vst3.8 {d0, d1, d2}, [%1]! \n"

    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(src_stride)  // %3
  :
  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
  );
}
#endif  // HAS_SCALEROWDOWN34_NEON
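
// For reference, a scalar C sketch of the filtering above (illustrative
// only; hypothetical helper). The rows are first blended 3:1, then each
// group of 4 blended pixels produces 3 outputs with (3,1)/4, (1,1)/2 and
// (1,3)/4 taps. vqrshrn and vrhadd round, hence the "+ 2" and "+ 1".
static void ScaleRowDown34_0_Box_SketchC(const uint8* src_ptr,
                                         ptrdiff_t src_stride,
                                         uint8* dst, int dst_width) {
  const uint8* s = src_ptr;               // row 0
  const uint8* t = src_ptr + src_stride;  // row 1
  int x;
  for (x = 0; x < dst_width / 3; ++x) {
    uint8 b0 = (uint8)((3 * s[4 * x + 0] + t[4 * x + 0] + 2) >> 2);
    uint8 b1 = (uint8)((3 * s[4 * x + 1] + t[4 * x + 1] + 2) >> 2);
    uint8 b2 = (uint8)((3 * s[4 * x + 2] + t[4 * x + 2] + 2) >> 2);
    uint8 b3 = (uint8)((3 * s[4 * x + 3] + t[4 * x + 3] + 2) >> 2);
    dst[3 * x + 0] = (uint8)((3 * b0 + b1 + 2) >> 2);
    dst[3 * x + 1] = (uint8)((b1 + b2 + 1) >> 1);
    dst[3 * x + 2] = (uint8)((b2 + 3 * b3 + 2) >> 2);
  }
}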

#ifdef HAS_SCALEROWDOWN34_NEON
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8 d24, #3 \n"
    "add %3, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"  // src line 1
    "subs %2, %2, #24 \n"
    // average src line 0 with src line 1
    "vrhadd.u8 q0, q0, q2 \n"
    "vrhadd.u8 q1, q1, q3 \n"

    // a0 = (src[0] * 3 + src[1] * 1) >> 2
    "vmovl.u8 q3, d1 \n"
    "vmlal.u8 q3, d0, d24 \n"
    "vqrshrn.u16 d0, q3, #2 \n"

    // a1 = (src[1] * 1 + src[2] * 1) >> 1
    "vrhadd.u8 d1, d1, d2 \n"

    // a2 = (src[2] * 1 + src[3] * 3) >> 2
    "vmovl.u8 q3, d2 \n"
    "vmlal.u8 q3, d3, d24 \n"
    "vqrshrn.u16 d2, q3, #2 \n"

    MEMACCESS(1)
    "vst3.8 {d0, d1, d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(src_stride)  // %3
  :
  : "q0", "q1", "q2", "q3", "d24", "memory", "cc"
  );
}
#endif  // HAS_SCALEROWDOWN34_NEON

#ifdef HAS_SCALEROWDOWN38_NEON
static uvec8 kShuf38 =
    { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
static uvec8 kShuf38_2 =
    { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
static vec16 kMult38_Div6 =
    { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
      65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
static vec16 kMult38_Div9 =
    { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
      65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };

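// For reference (illustrative only; hypothetical helper): vqrdmulh.s16
// computes (2 * a * b + 32768) >> 16, so multiplying a sum by 65536 / 12
// (kMult38_Div6) divides it by 6 with rounding, and 65536 / 18
// (kMult38_Div9) divides by 9. A scalar sketch of the same trick:
static uint8 ScaleDiv6_Sketch(int sum) {
  return (uint8)((2 * sum * (65536 / 12) + 32768) >> 16);  // ~= sum / 6
}
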
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(3)
    "vld1.8 {q3}, [%3] \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
    "subs %2, %2, #12 \n"
    "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
    "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
    MEMACCESS(1)
    "vst1.8 {d4}, [%1]! \n"
    MEMACCESS(1)
    "vst1.32 {d5[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"(&kShuf38)    // %3
  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
  );
}

#endif  // HAS_SCALEROWDOWN38_NEON
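
// For reference (illustrative only; hypothetical helper): the two vtbl.u8
// lookups above gather dst[i] = src[kShuf38[i]], picking 12 of every 32
// input pixels.
static void ScaleRowDown38_SketchC(const uint8* src_ptr, uint8* dst,
                                   int dst_width) {
  // Same indices as kShuf38.
  static const uint8 kIdx[12] = { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30 };
  int x, i;
  for (x = 0; x + 12 <= dst_width; x += 12) {
    for (i = 0; i < 12; ++i) {
      dst[x + i] = src_ptr[kIdx[i]];
    }
    src_ptr += 32;
  }
}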

#ifdef HAS_SCALEROWDOWN38_NEON
// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride * 2;

  asm volatile (
    MEMACCESS(5)
    "vld1.16 {q13}, [%5] \n"
    MEMACCESS(6)
    "vld1.8 {q14}, [%6] \n"
    MEMACCESS(7)
    "vld1.8 {q15}, [%7] \n"
    "add %3, %0 \n"
    ".p2align 2 \n"
    "1: \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
    MEMACCESS(4)
    "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
    "subs %2, %2, #12 \n"

    // Shuffle the input data around to align the data
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8 d0, d1 \n"
    "vtrn.u8 d4, d5 \n"
    "vtrn.u8 d16, d17 \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8 d2, d3 \n"
    "vtrn.u8 d6, d7 \n"
    "vtrn.u8 d18, d19 \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d1 = 40+50 41+51 42+52 43+53
    "vpaddl.u8 q0, q0 \n"
    "vpaddl.u8 q2, q2 \n"
    "vpaddl.u8 q8, q8 \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8 d3, d3 \n"
    "vpaddl.u8 d7, d7 \n"
    "vpaddl.u8 d19, d19 \n"

    // combine source lines
    "vadd.u16 q0, q2 \n"
    "vadd.u16 q0, q8 \n"
    "vadd.u16 d4, d3, d7 \n"
    "vadd.u16 d4, d19 \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //             + s[6 + st * 1] + s[7 + st * 1]
    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
    "vqrdmulh.s16 q2, q2, q13 \n"
    "vmovn.u16 d4, q2 \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8 q1, d2 \n"
    "vmovl.u8 q3, d6 \n"
    "vmovl.u8 q9, d18 \n"

    // combine source lines
    "vadd.u16 q1, q3 \n"
    "vadd.u16 q1, q9 \n"

    // d2 = xx 20 xx 30 xx 22 xx 32
    // d3 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32 d2, d3 \n"

    // d2 = xx 20 xx 21 xx 22 xx 23
    // d3 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16 d2, d3 \n"

    // 0+1+2, 3+4+5
    "vadd.u16 q0, q1 \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q15 \n"

    // Align for table lookup, vtbl requires registers to
    // be adjacent
    "vmov.u8 d2, d4 \n"

    "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
    "vtbl.u8 d4, {d0, d1, d2}, d29 \n"

    MEMACCESS(1)
    "vst1.8 {d3}, [%1]! \n"
    MEMACCESS(1)
    "vst1.32 {d4[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),       // %0
    "+r"(dst_ptr),       // %1
    "+r"(dst_width),     // %2
    "+r"(src_stride),    // %3
    "+r"(src_ptr1)       // %4
  : "r"(&kMult38_Div6),  // %5
    "r"(&kShuf38_2),     // %6
    "r"(&kMult38_Div9)   // %7
  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
  );
}
#endif  // HAS_SCALEROWDOWN38_NEON

#ifdef HAS_SCALEROWDOWN38_NEON
// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(4)
    "vld1.16 {q13}, [%4] \n"
    MEMACCESS(5)
    "vld1.8 {q14}, [%5] \n"
    "add %3, %0 \n"
    ".p2align 2 \n"
    "1: \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
    "subs %2, %2, #12 \n"

    // Shuffle the input data around to align the data
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8 d0, d1 \n"
    "vtrn.u8 d4, d5 \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8 d2, d3 \n"
    "vtrn.u8 d6, d7 \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d1 = 40+50 41+51 42+52 43+53
    "vpaddl.u8 q0, q0 \n"
    "vpaddl.u8 q2, q2 \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8 d3, d3 \n"
    "vpaddl.u8 d7, d7 \n"

    // combine source lines
    "vadd.u16 q0, q2 \n"
    "vadd.u16 d4, d3, d7 \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "vqrshrn.u16 d4, q2, #2 \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8 q1, d2 \n"
    "vmovl.u8 q3, d6 \n"

    // combine source lines
    "vadd.u16 q1, q3 \n"

    // d2 = xx 20 xx 30 xx 22 xx 32
    // d3 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32 d2, d3 \n"

    // d2 = xx 20 xx 21 xx 22 xx 23
    // d3 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16 d2, d3 \n"

    // 0+1+2, 3+4+5
    "vadd.u16 q0, q1 \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q13 \n"

    // Align for table lookup, vtbl requires registers to
    // be adjacent
    "vmov.u8 d2, d4 \n"

    "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
    "vtbl.u8 d4, {d0, d1, d2}, d29 \n"

    MEMACCESS(1)
    "vst1.8 {d3}, [%1]! \n"
    MEMACCESS(1)
    "vst1.32 {d4[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),       // %0
    "+r"(dst_ptr),       // %1
    "+r"(dst_width),     // %2
    "+r"(src_stride)     // %3
  : "r"(&kMult38_Div6),  // %4
    "r"(&kShuf38_2)      // %5
  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
  );
}
#endif  // HAS_SCALEROWDOWN38_NEON

#if 0
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
  asm volatile (
    "cmp %4, #0 \n"
    "beq 100f \n"
    "add %2, %1 \n"
    "cmp %4, #64 \n"
    "beq 75f \n"
    "cmp %4, #128 \n"
    "beq 50f \n"
    "cmp %4, #192 \n"
    "beq 25f \n"

    "vdup.8 d5, %4 \n"
    "rsb %4, #256 \n"
    "vdup.8 d4, %4 \n"
    // General purpose row blend.
    "1: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vmull.u8 q13, d0, d4 \n"
    "vmull.u8 q14, d1, d4 \n"
    "vmlal.u8 q13, d2, d5 \n"
    "vmlal.u8 q14, d3, d5 \n"
    "vrshrn.u16 d0, q13, #8 \n"
    "vrshrn.u16 d1, q14, #8 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 1b \n"
    "b 99f \n"

    // Blend 25 / 75.
    "25: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 25b \n"
    "b 99f \n"

    // Blend 50 / 50.
    "50: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 50b \n"
    "b 99f \n"

    // Blend 75 / 25.
    "75: \n"
    MEMACCESS(1)
    "vld1.8 {q1}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q0}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 75b \n"
    "b 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    "100: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    "subs %3, %3, #16 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 100b \n"

    "99: \n"
    MEMACCESS(0)
    "vst1.8 {d1[7]}, [%0] \n"
  : "+r"(dst_ptr),           // %0
    "+r"(src_ptr),           // %1
    "+r"(src_stride),        // %2
    "+r"(dst_width),         // %3
    "+r"(source_y_fraction)  // %4
  :
  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
  );
}
#endif  // 0
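
// For reference, a scalar C sketch of the disabled row blend above
// (illustrative only; hypothetical helper). The general path computes
// dst = (row0 * (256 - f) + row1 * f + 128) >> 8 with f = source_y_fraction;
// the 0/64/128/192 branches are fast paths for the same formula, and the
// final store duplicates the last pixel.
static void ScaleFilterRows_SketchC(uint8* dst_ptr, const uint8* src_ptr,
                                    ptrdiff_t src_stride, int dst_width,
                                    int source_y_fraction) {
  int f1 = source_y_fraction;
  int f0 = 256 - f1;
  const uint8* s = src_ptr;               // row 0
  const uint8* t = src_ptr + src_stride;  // row 1
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((s[x] * f0 + t[x] * f1 + 128) >> 8);
  }
  dst_ptr[dst_width] = dst_ptr[dst_width - 1];  // duplicate last pixel
}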

#ifdef HAS_SCALEARGBROWDOWN2_NEON
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "vld2.32 {q0, q1}, [%0]! \n"
    MEMACCESS(0)
    "vld2.32 {q2, q3}, [%0]! \n"
    "subs %2, %2, #8 \n"  // 8 processed per loop
    MEMACCESS(1)
    "vst1.8 {q1}, [%1]! \n"  // store odd pixels
    MEMACCESS(1)
    "vst1.8 {q3}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst),       // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
#endif  // HAS_SCALEARGBROWDOWN2_NEON

#ifdef HAS_SCALEARGBROWDOWN2_NEON
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add %1, %1, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB pixels.
    "subs %3, %3, #8 \n"  // 8 processed per loop.
    "vpaddl.u8 q0, q0 \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8 q3, q3 \n"  // A 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8 {d16, d18, d20, d22}, [%1]! \n"  // load 8 more ARGB pixels.
    MEMACCESS(1)
    "vld4.8 {d17, d19, d21, d23}, [%1]! \n"  // load last 8 ARGB pixels.
    "vpadal.u8 q0, q8 \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q9 \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8 q2, q10 \n"  // R 16 bytes -> 8 shorts.
    "vpadal.u8 q3, q11 \n"  // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #2 \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2 \n"
    "vrshrn.u16 d2, q2, #2 \n"
    "vrshrn.u16 d3, q3, #2 \n"
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),     // %0
    "+r"(src_stride),  // %1
    "+r"(dst),         // %2
    "+r"(dst_width)    // %3
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}
#endif  // HAS_SCALEARGBROWDOWN2_NEON

#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov r12, %3, lsl #2 \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.32 {d0[0]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d0[1]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d1[0]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d1[1]}, [%0], r12 \n"
    "subs %2, %2, #4 \n"  // 4 pixels per loop.
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  : "r"(src_stepx)   // %3
  : "memory", "cc", "r12", "q0"
  );
}
#endif  // HAS_SCALEARGBROWDOWNEVEN_NEON
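
// For reference (illustrative only; hypothetical helper): the strided
// loads above step through whole 4-byte ARGB pixels, so each output pixel
// i is a copy of input pixel i * src_stepx.
static void ScaleARGBRowDownEven_SketchC(const uint8* src_argb, int src_stepx,
                                         uint8* dst_argb, int dst_width) {
  int x, b;
  for (x = 0; x < dst_width; ++x) {
    for (b = 0; b < 4; ++b) {  // copy one ARGB pixel (4 bytes)
      dst_argb[4 * x + b] = src_argb[4 * x * src_stepx + b];
    }
  }
}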

#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov r12, %4, lsl #2 \n"
    "add %1, %1, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0}, [%0], r12 \n"  // Read 4 2x2 blocks -> 2x1
    MEMACCESS(1)
    "vld1.8 {d1}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d2}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d3}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d4}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d5}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d6}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d7}, [%1], r12 \n"
    "vaddl.u8 q0, d0, d1 \n"
    "vaddl.u8 q1, d2, d3 \n"
    "vaddl.u8 q2, d4, d5 \n"
    "vaddl.u8 q3, d6, d7 \n"
    "vswp.8 d1, d2 \n"  // ab_cd -> ac_bd
    "vswp.8 d5, d6 \n"  // ef_gh -> eg_fh
    "vadd.u16 q0, q0, q1 \n"  // (a+b)_(c+d)
    "vadd.u16 q2, q2, q3 \n"  // (e+f)_(g+h)
    "vrshrn.u16 d0, q0, #2 \n"  // first 2 pixels.
    "vrshrn.u16 d1, q2, #2 \n"  // next 2 pixels.
    "subs %3, %3, #4 \n"  // 4 pixels per loop.
    MEMACCESS(2)
    "vst1.8 {q0}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_argb),    // %0
    "+r"(src_stride),  // %1
    "+r"(dst_argb),    // %2
    "+r"(dst_width)    // %3
  : "r"(src_stepx)     // %4
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
  );
}
#endif  // HAS_SCALEARGBROWDOWNEVEN_NEON
#endif  // __aarch64__

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif