| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include "libyuv/row.h" | 11 #include "libyuv/row.h" |
| 12 #include "libyuv/rotate_row.h" | 12 #include "libyuv/rotate_row.h" |
| 13 | 13 |
| 14 #include "libyuv/basic_types.h" | 14 #include "libyuv/basic_types.h" |
| 15 | 15 |
| 16 #ifdef __cplusplus | 16 #ifdef __cplusplus |
| 17 namespace libyuv { | 17 namespace libyuv { |
| 18 extern "C" { | 18 extern "C" { |
| 19 #endif | 19 #endif |
| 20 | 20 |
| 21 // This module is for GCC Neon armv8 64 bit. | 21 // This module is for GCC Neon armv8 64 bit. |
| 22 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 22 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
| 23 | 23 |
| 24 static uvec8 kVTbl4x4Transpose = | 24 static uvec8 kVTbl4x4Transpose = |
| 25 { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; | 25 { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; |
| 26 | 26 |
| 27 void TransposeWx8_NEON(const uint8* src, int src_stride, | 27 void TransposeWx8_NEON(const uint8* src, int src_stride, |
| 28 uint8* dst, int dst_stride, int width) { | 28 uint8* dst, int dst_stride, int width) { |
| 29 const uint8* src_temp = NULL; | 29 const uint8* src_temp; |
| 30 int64 width64 = (int64) width; // Work around clang 3.4 warning. | 30 int64 width64 = (int64) width; // Work around clang 3.4 warning. |
| 31 asm volatile ( | 31 asm volatile ( |
| 32 // loops are on blocks of 8. loop will stop when | 32 // loops are on blocks of 8. loop will stop when |
| 33 // counter gets to or below 0. starting the counter | 33 // counter gets to or below 0. starting the counter |
| 34 // at w-8 allows for this | 34 // at w-8 allows for this |
| 35 "sub %3, %3, #8 \n" | 35 "sub %3, %3, #8 \n" |
| 36 | 36 |
| 37 // handle 8x8 blocks. this should be the majority of the plane | 37 // handle 8x8 blocks. this should be the majority of the plane |
| 38 "1: \n" | 38 "1: \n" |
| 39 "mov %0, %1 \n" | 39 "mov %0, %1 \n" |
| (...skipping 188 matching lines...) |
| 228 MEMACCESS(1) | 228 MEMACCESS(1) |
| 229 "ld1 {v0.b}[6], [%1], %5 \n" | 229 "ld1 {v0.b}[6], [%1], %5 \n" |
| 230 MEMACCESS(1) | 230 MEMACCESS(1) |
| 231 "ld1 {v0.b}[7], [%1] \n" | 231 "ld1 {v0.b}[7], [%1] \n" |
| 232 | 232 |
| 233 MEMACCESS(2) | 233 MEMACCESS(2) |
| 234 "st1 {v0.8b}, [%2] \n" | 234 "st1 {v0.8b}, [%2] \n" |
| 235 | 235 |
| 236 "4: \n" | 236 "4: \n" |
| 237 | 237 |
| 238 : "+r"(src_temp), // %0 | 238 : "=&r"(src_temp), // %0 |
| 239 "+r"(src), // %1 | 239 "+r"(src), // %1 |
| 240 "+r"(dst), // %2 | 240 "+r"(dst), // %2 |
| 241 "+r"(width64) // %3 | 241 "+r"(width64) // %3 |
| 242 : "r"(&kVTbl4x4Transpose), // %4 | 242 : "r"(&kVTbl4x4Transpose), // %4 |
| 243 "r"(static_cast<ptrdiff_t>(src_stride)), // %5 | 243 "r"(static_cast<ptrdiff_t>(src_stride)), // %5 |
| 244 "r"(static_cast<ptrdiff_t>(dst_stride)) // %6 | 244 "r"(static_cast<ptrdiff_t>(dst_stride)) // %6 |
| 245 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", | 245 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", |
| 246 "v17", "v18", "v19", "v20", "v21", "v22", "v23" | 246 "v17", "v18", "v19", "v20", "v21", "v22", "v23" |
| 247 ); | 247 ); |
| 248 } | 248 } |
| 249 | 249 |
| 250 static uint8 kVTbl4x4TransposeDi[32] = | 250 static uint8 kVTbl4x4TransposeDi[32] = |
| 251 { 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, | 251 { 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, |
| 252 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; | 252 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; |
| 253 | 253 |
| 254 void TransposeUVWx8_NEON(const uint8* src, int src_stride, | 254 void TransposeUVWx8_NEON(const uint8* src, int src_stride, |
| 255 uint8* dst_a, int dst_stride_a, | 255 uint8* dst_a, int dst_stride_a, |
| 256 uint8* dst_b, int dst_stride_b, | 256 uint8* dst_b, int dst_stride_b, |
| 257 int width) { | 257 int width) { |
| 258 const uint8* src_temp = NULL; | 258 const uint8* src_temp; |
| 259 int64 width64 = (int64) width; // Work around clang 3.4 warning. | 259 int64 width64 = (int64) width; // Work around clang 3.4 warning. |
| 260 asm volatile ( | 260 asm volatile ( |
| 261 // loops are on blocks of 8. loop will stop when | 261 // loops are on blocks of 8. loop will stop when |
| 262 // counter gets to or below 0. starting the counter | 262 // counter gets to or below 0. starting the counter |
| 263 // at w-8 allows for this | 263 // at w-8 allows for this |
| 264 "sub %4, %4, #8 \n" | 264 "sub %4, %4, #8 \n" |
| 265 | 265 |
| 266 // handle 8x8 blocks. this should be the majority of the plane | 266 // handle 8x8 blocks. this should be the majority of the plane |
| 267 "1: \n" | 267 "1: \n" |
| 268 "mov %0, %1 \n" | 268 "mov %0, %1 \n" |
| (...skipping 244 matching lines...) |
| 513 MEMACCESS(1) | 513 MEMACCESS(1) |
| 514 "ld2 {v0.b, v1.b}[7], [%1] \n" | 514 "ld2 {v0.b, v1.b}[7], [%1] \n" |
| 515 | 515 |
| 516 MEMACCESS(2) | 516 MEMACCESS(2) |
| 517 "st1 {v0.d}[0], [%2] \n" | 517 "st1 {v0.d}[0], [%2] \n" |
| 518 MEMACCESS(3) | 518 MEMACCESS(3) |
| 519 "st1 {v1.d}[0], [%3] \n" | 519 "st1 {v1.d}[0], [%3] \n" |
| 520 | 520 |
| 521 "4: \n" | 521 "4: \n" |
| 522 | 522 |
| 523 : "+r"(src_temp), // %0 | 523 : "=&r"(src_temp), // %0 |
| 524 "+r"(src), // %1 | 524 "+r"(src), // %1 |
| 525 "+r"(dst_a), // %2 | 525 "+r"(dst_a), // %2 |
| 526 "+r"(dst_b), // %3 | 526 "+r"(dst_b), // %3 |
| 527 "+r"(width64) // %4 | 527 "+r"(width64) // %4 |
| 528 : "r"(static_cast<ptrdiff_t>(src_stride)), // %5 | 528 : "r"(static_cast<ptrdiff_t>(src_stride)), // %5 |
| 529 "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6 | 529 "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6 |
| 530 "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7 | 530 "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7 |
| 531 "r"(&kVTbl4x4TransposeDi) // %8 | 531 "r"(&kVTbl4x4TransposeDi) // %8 |
| 532 : "memory", "cc", | 532 : "memory", "cc", |
| 533 "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 533 "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 534 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | 534 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", |
| 535 "v30", "v31" | 535 "v30", "v31" |
| 536 ); | 536 ); |
| 537 } | 537 } |
| 538 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 538 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
| 539 | 539 |
| 540 #ifdef __cplusplus | 540 #ifdef __cplusplus |
| 541 } // extern "C" | 541 } // extern "C" |
| 542 } // namespace libyuv | 542 } // namespace libyuv |
| 543 #endif | 543 #endif |
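A note on the one-line change this diff makes in each function: "+r"(src_temp) declared the scratch pointer as a read-write operand, so the compiler had to assume the asm consumed its initial value, which is why the old code initialized it to NULL. In fact the asm always writes %0 (via "mov %0, %1") before reading it, so the operand is a pure output. "=&r" says exactly that: "=" marks it write-only, and the early-clobber "&" tells the compiler that %0 is written before the inputs are done being read, so it must not share a register with any of them. With the correct constraint, the NULL initializer becomes dead and can be dropped. Below is a minimal sketch of the same pattern using a hypothetical byte-copy loop; it is illustrative only, not code from libyuv.

#include <cstdint>

// Illustrative only: a scratch pointer used the same way as src_temp above.
// 'tmp' is written by the asm before the other operands are consumed, so it
// must be a write-only ("="), early-clobber ("&") output -- without "&" the
// compiler could legally give it the same register as 'dst', 'n', or 'src'.
// Assumes n > 0 (the loop body runs at least once).
static void CopyBytes(const uint8_t* src, uint8_t* dst, int64_t n) {
  const uint8_t* tmp;  // no initializer: never read before the asm writes it
  asm volatile (
    "mov   %0, %3                  \n"  // tmp = src; first write to %0
    "1:                            \n"
    "ldrb  w3, [%0], #1            \n"  // load byte, post-increment tmp
    "strb  w3, [%1], #1            \n"  // store byte, post-increment dst
    "subs  %2, %2, #1              \n"  // --n, setting flags
    "b.gt  1b                      \n"
    : "=&r"(tmp),  // %0: write-only scratch, early-clobber
      "+r"(dst),   // %1
      "+r"(n)      // %2
    : "r"(src)     // %3
    : "memory", "cc", "x3"
  );
}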
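For reference, kVTbl4x4Transpose feeds the asm's "tbl" byte-shuffle: treating the 16 bytes of a vector as a row-major 4x4 matrix, the indices { 0, 4, 8, 12, 1, 5, 9, 13, ... } gather the bytes column-major, i.e. transpose the block. The same gather written with intrinsics looks like the sketch below (illustrative only, not code from this file).

#include <arm_neon.h>
#include <stdint.h>

// Illustrative only: vqtbl1q_u8 is the intrinsics form of the AArch64 "tbl"
// instruction.  With kVTbl4x4Transpose as the index vector, the 16 input
// bytes (a row-major 4x4 matrix) come out column-major, i.e. transposed.
static uint8x16_t Transpose4x4Bytes(uint8x16_t rows) {
  static const uint8_t kIdx[16] = { 0, 4, 8, 12, 1, 5, 9, 13,
                                    2, 6, 10, 14, 3, 7, 11, 15 };
  return vqtbl1q_u8(rows, vld1q_u8(kIdx));
}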