Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(13)

Unified Diff: source/libvpx/third_party/libyuv/source/row_neon.cc

Issue 478033002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: source/libvpx/third_party/libyuv/source/row_neon.cc
===================================================================
--- source/libvpx/third_party/libyuv/source/row_neon.cc (revision 290053)
+++ source/libvpx/third_party/libyuv/source/row_neon.cc (working copy)
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
@@ -20,34 +20,46 @@
// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
+ MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
"vld1.32 {d2[0]}, [%1]! \n" \
+ MEMACCESS(2) \
"vld1.32 {d2[1]}, [%2]! \n"
// Read 8 Y, 2 U and 2 V from 422
#define READYUV411 \
+ MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
"vld1.16 {d2[0]}, [%1]! \n" \
+ MEMACCESS(2) \
"vld1.16 {d2[1]}, [%2]! \n" \
"vmov.u8 d3, d2 \n" \
"vzip.u8 d2, d3 \n"
// Read 8 Y, 8 U and 8 V from 444
#define READYUV444 \
+ MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
"vld1.8 {d2}, [%1]! \n" \
+ MEMACCESS(2) \
"vld1.8 {d3}, [%2]! \n" \
"vpaddl.u8 q1, q1 \n" \
"vrshrn.u16 d2, q1, #1 \n"
// Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \
+ MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
"vmov.u8 d2, #128 \n"
// Read 8 Y and 4 UV from NV12
#define READNV12 \
+ MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
"vld1.8 {d2}, [%1]! \n" \
"vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
"vuzp.u8 d2, d3 \n" \
@@ -55,7 +67,9 @@
// Read 8 Y and 4 VU from NV21
#define READNV21 \
+ MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
"vld1.8 {d2}, [%1]! \n" \
"vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
"vuzp.u8 d3, d2 \n" \
@@ -63,6 +77,7 @@
// Read 8 YUY2
#define READYUY2 \
+ MEMACCESS(0) \
"vld2.8 {d0, d2}, [%0]! \n" \
"vmov.u8 d3, d2 \n" \
"vuzp.u8 d2, d3 \n" \
@@ -70,6 +85,7 @@
// Read 8 UYVY
#define READUYVY \
+ MEMACCESS(0) \
"vld2.8 {d2, d3}, [%0]! \n" \
"vmov.u8 d0, d3 \n" \
"vmov.u8 d3, d2 \n" \
@@ -113,7 +129,9 @@
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -124,6 +142,7 @@
YUV422TORGB
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -144,7 +163,9 @@
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -155,6 +176,7 @@
YUV422TORGB
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -175,7 +197,9 @@
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -186,6 +210,7 @@
YUV422TORGB
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -206,7 +231,9 @@
uint8* dst_bgra,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -218,6 +245,7 @@
"subs %4, %4, #8 \n"
"vswp.u8 d20, d22 \n"
"vmov.u8 d19, #255 \n"
+ MEMACCESS(3)
"vst4.8 {d19, d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -238,7 +266,9 @@
uint8* dst_abgr,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -250,6 +280,7 @@
"subs %4, %4, #8 \n"
"vswp.u8 d20, d22 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -270,7 +301,9 @@
uint8* dst_rgba,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -281,6 +314,7 @@
YUV422TORGB
"subs %4, %4, #8 \n"
"vmov.u8 d19, #255 \n"
+ MEMACCESS(3)
"vst4.8 {d19, d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -301,7 +335,9 @@
uint8* dst_rgb24,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -311,6 +347,7 @@
READYUV422
YUV422TORGB
"subs %4, %4, #8 \n"
+ MEMACCESS(3)
"vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -331,7 +368,9 @@
uint8* dst_raw,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -342,6 +381,7 @@
YUV422TORGB
"subs %4, %4, #8 \n"
"vswp.u8 d20, d22 \n"
+ MEMACCESS(3)
"vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -374,7 +414,9 @@
uint8* dst_rgb565,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -385,6 +427,7 @@
YUV422TORGB
"subs %4, %4, #8 \n"
ARGBTORGB565
+ MEMACCESS(3)
"vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -420,7 +463,9 @@
uint8* dst_argb1555,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -432,6 +477,7 @@
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n"
ARGBTOARGB1555
+ MEMACCESS(3)
"vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -461,7 +507,9 @@
uint8* dst_argb4444,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -474,6 +522,7 @@
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n"
ARGBTOARGB4444
+ MEMACCESS(3)
"vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -492,7 +541,9 @@
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(3)
"vld1.8 {d24}, [%3] \n"
+ MEMACCESS(4)
"vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -503,6 +554,7 @@
YUV422TORGB
"subs %2, %2, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -522,10 +574,12 @@
".p2align 2 \n"
"vmov.u8 d23, #255 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d20}, [%0]! \n"
"vmov d21, d20 \n"
"vmov d22, d20 \n"
"subs %2, %2, #8 \n"
+ MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -541,7 +595,9 @@
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(4)
"vld1.8 {d24}, [%4] \n"
+ MEMACCESS(5)
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -552,6 +608,7 @@
YUV422TORGB
"subs %3, %3, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(2)
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -570,7 +627,9 @@
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(4)
"vld1.8 {d24}, [%4] \n"
+ MEMACCESS(5)
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -581,6 +640,7 @@
YUV422TORGB
"subs %3, %3, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(2)
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -599,7 +659,9 @@
uint8* dst_rgb565,
int width) {
asm volatile (
+ MEMACCESS(4)
"vld1.8 {d24}, [%4] \n"
+ MEMACCESS(5)
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -610,6 +672,7 @@
YUV422TORGB
"subs %3, %3, #8 \n"
ARGBTORGB565
+ MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -628,7 +691,9 @@
uint8* dst_rgb565,
int width) {
asm volatile (
+ MEMACCESS(4)
"vld1.8 {d24}, [%4] \n"
+ MEMACCESS(5)
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -639,6 +704,7 @@
YUV422TORGB
"subs %3, %3, #8 \n"
ARGBTORGB565
+ MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -656,7 +722,9 @@
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(3)
"vld1.8 {d24}, [%3] \n"
+ MEMACCESS(4)
"vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -667,6 +735,7 @@
YUV422TORGB
"subs %2, %2, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_yuy2), // %0
@@ -683,7 +752,9 @@
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(3)
"vld1.8 {d24}, [%3] \n"
+ MEMACCESS(4)
"vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -694,6 +765,7 @@
YUV422TORGB
"subs %2, %2, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_uyvy), // %0
@@ -712,9 +784,12 @@
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store U
+ MEMACCESS(2)
"vst1.8 {q1}, [%2]! \n" // store V
"bgt 1b \n"
: "+r"(src_uv), // %0
@@ -732,9 +807,12 @@
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load U
+ MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop
+ MEMACCESS(2)
"vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
"bgt 1b \n"
:
@@ -752,8 +830,10 @@
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop
+ MEMACCESS(1)
"vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
"bgt 1b \n"
: "+r"(src), // %0
@@ -770,6 +850,7 @@
"vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" // store
"bgt 1b \n"
: "+r"(dst), // %0
@@ -798,10 +879,13 @@
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0], r3 \n" // src -= 16
"subs %2, #16 \n" // 16 pixels per loop.
"vrev64.8 q0, q0 \n"
+ MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // dst += 16
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src), // %0
@@ -822,10 +906,13 @@
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
"subs %3, #8 \n" // 8 pixels per loop.
"vrev64.8 q0, q0 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // dst += 8
+ MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_uv), // %0
@@ -846,10 +933,13 @@
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0], r3 \n" // src -= 16
"subs %2, #4 \n" // 4 pixels per loop.
"vrev64.32 q0, q0 \n"
+ MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // dst += 16
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src), // %0
@@ -865,8 +955,10 @@
"vmov.u8 d4, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
+ MEMACCESS(1)
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_rgb24), // %0
@@ -882,9 +974,11 @@
"vmov.u8 d4, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B
+ MEMACCESS(1)
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_raw), // %0
@@ -912,9 +1006,11 @@
"vmov.u8 d3, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
+ MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_rgb565), // %0
@@ -958,9 +1054,11 @@
"vmov.u8 d3, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
+ MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_argb1555), // %0
@@ -987,9 +1085,11 @@
"vmov.u8 d3, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
+ MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_argb4444), // %0
@@ -1004,8 +1104,10 @@
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
+ MEMACCESS(1)
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1020,9 +1122,11 @@
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B
+ MEMACCESS(1)
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1037,8 +1141,10 @@
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop.
+ MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
@@ -1053,8 +1159,10 @@
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop.
+ MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
@@ -1070,9 +1178,12 @@
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // store 8 U.
+ MEMACCESS(2)
"vst1.8 {d3}, [%2]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
@@ -1089,9 +1200,12 @@
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 U.
+ MEMACCESS(2)
"vst1.8 {d2}, [%2]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
@@ -1109,12 +1223,16 @@
"add %1, %0, %1 \n" // stride + src_yuy2
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
"vrhadd.u8 d1, d1, d5 \n" // average rows of U
"vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" // store 8 U.
+ MEMACCESS(3)
"vst1.8 {d3}, [%3]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
@@ -1133,12 +1251,16 @@
"add %1, %0, %1 \n" // stride + src_uyvy
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
"vrhadd.u8 d0, d0, d4 \n" // average rows of U
"vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 U.
+ MEMACCESS(3)
"vst1.8 {d2}, [%3]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
@@ -1157,10 +1279,13 @@
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
"subs %3, %3, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
"vrhadd.u8 q0, q1 \n" // average row 1 and 2
+ MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_uv), // %0
@@ -1178,11 +1303,13 @@
asm volatile (
"vmov.u32 d6[0], %3 \n" // selector
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
"vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
"vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
"vtrn.u32 d4, d5 \n" // combine 8 pixels
+ MEMACCESS(1)
"vst1.8 {d4}, [%1]! \n" // store 8.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1198,8 +1325,10 @@
uint32 /*selector*/, int pix) {
asm volatile (
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
+ MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // store 8 G's.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1214,12 +1343,15 @@
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
asm volatile (
+ MEMACCESS(3)
"vld1.8 {q2}, [%3] \n" // shuffler
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop
"vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
"vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store 4.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1237,10 +1369,14 @@
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ MEMACCESS(1)
"vld1.8 {d1}, [%1]! \n" // load 8 Us
+ MEMACCESS(2)
"vld1.8 {d3}, [%2]! \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
+ MEMACCESS(3)
"vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -1260,10 +1396,14 @@
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ MEMACCESS(1)
"vld1.8 {d0}, [%1]! \n" // load 8 Us
+ MEMACCESS(2)
"vld1.8 {d2}, [%2]! \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
+ MEMACCESS(3)
"vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -1280,9 +1420,11 @@
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTORGB565
+ MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1298,9 +1440,11 @@
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
+ MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1317,9 +1461,11 @@
"vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
+ MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1338,6 +1484,7 @@
"vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
@@ -1345,6 +1492,7 @@
"vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1362,12 +1510,14 @@
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
"vmlal.u8 q2, d1, d25 \n" // G
"vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1390,6 +1540,7 @@
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
@@ -1405,7 +1556,9 @@
"vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1429,7 +1582,9 @@
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
@@ -1450,7 +1605,9 @@
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1475,12 +1632,16 @@
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(0)
"vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(0)
"vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels.
"vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1508,7 +1669,9 @@
"vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1547,12 +1710,16 @@
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1564,7 +1731,9 @@
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1591,12 +1760,16 @@
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1608,7 +1781,9 @@
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1634,12 +1809,16 @@
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
"vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
+ MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
"vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
@@ -1651,7 +1830,9 @@
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q3, q2, q1)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_bgra), // %0
@@ -1677,12 +1858,16 @@
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
"vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1694,7 +1879,9 @@
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q2, q1, q0)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_abgr), // %0
@@ -1720,12 +1907,16 @@
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
"vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
+ MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
"vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
@@ -1737,7 +1928,9 @@
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_rgba), // %0
@@ -1763,12 +1956,16 @@
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ MEMACCESS(0)
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ MEMACCESS(1)
"vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1780,7 +1977,9 @@
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_rgb24), // %0
@@ -1806,12 +2005,16 @@
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ MEMACCESS(0)
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ MEMACCESS(1)
"vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
"vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1823,7 +2026,9 @@
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q2, q1, q0)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_raw), // %0
@@ -1850,22 +2055,26 @@
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
RGB565TOARGB
"vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
"vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
RGB565TOARGB
"vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
@@ -1887,7 +2096,9 @@
"vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_rgb565), // %0
@@ -1914,22 +2125,26 @@
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
"vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
"vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
"vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
@@ -1951,7 +2166,9 @@
"vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb1555), // %0
@@ -1978,22 +2195,26 @@
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
"vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
"vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
"vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
@@ -2015,7 +2236,9 @@
"vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb4444), // %0
@@ -2037,6 +2260,7 @@
"vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
@@ -2045,6 +2269,7 @@
"vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_rgb565), // %0
@@ -2063,6 +2288,7 @@
"vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
@@ -2071,6 +2297,7 @@
"vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb1555), // %0
@@ -2089,6 +2316,7 @@
"vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
@@ -2097,6 +2325,7 @@
"vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb4444), // %0
@@ -2115,6 +2344,7 @@
"vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // R
@@ -2122,6 +2352,7 @@
"vmlal.u8 q8, d3, d6 \n" // B
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_bgra), // %0
@@ -2140,6 +2371,7 @@
"vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // R
@@ -2147,6 +2379,7 @@
"vmlal.u8 q8, d2, d6 \n" // B
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_abgr), // %0
@@ -2165,6 +2398,7 @@
"vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // B
@@ -2172,6 +2406,7 @@
"vmlal.u8 q8, d3, d6 \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_rgba), // %0
@@ -2190,6 +2425,7 @@
"vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // B
@@ -2197,6 +2433,7 @@
"vmlal.u8 q8, d2, d6 \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_rgb24), // %0
@@ -2215,6 +2452,7 @@
"vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // B
@@ -2222,6 +2460,7 @@
"vmlal.u8 q8, d2, d6 \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_raw), // %0
@@ -2252,7 +2491,9 @@
"vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
@@ -2261,46 +2502,58 @@
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
+ MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 100b \n"
@@ -2323,7 +2576,9 @@
"blt 89f \n"
// Blend 8 pixels.
"8: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q10, d4, d3 \n" // db * a
@@ -2337,6 +2592,7 @@
"vqadd.u8 q0, q0, q2 \n" // + sbg
"vqadd.u8 d2, d2, d6 \n" // + sr
"vmov.u8 d3, #255 \n" // a = 255
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
"bge 8b \n"
@@ -2346,7 +2602,9 @@
// Blend 1 pixels.
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ MEMACCESS(1)
"vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
"subs %3, %3, #1 \n" // 1 processed per loop.
"vmull.u8 q10, d4, d3 \n" // db * a
@@ -2360,6 +2618,7 @@
"vqadd.u8 q0, q0, q2 \n" // + sbg
"vqadd.u8 d2, d2, d6 \n" // + sr
"vmov.u8 d3, #255 \n" // a = 255
+ MEMACCESS(2)
"vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
"bge 1b \n"
@@ -2379,6 +2638,7 @@
asm volatile (
// Attenuate 8 pixels.
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q10, d0, d3 \n" // b * a
@@ -2387,6 +2647,7 @@
"vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
"vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
"vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
+ MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -2410,6 +2671,7 @@
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
"subs %1, %1, #8 \n" // 8 processed per loop.
"vmovl.u8 q0, d0 \n" // b (0 .. 255)
@@ -2427,6 +2689,7 @@
"vqmovn.u16 d0, q0 \n"
"vqmovn.u16 d2, q1 \n"
"vqmovn.u16 d4, q2 \n"
+ MEMACCESS(0)
"vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(dst_argb), // %0
@@ -2451,6 +2714,7 @@
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q10, d20 \n" // b (0 .. 255)
@@ -2465,6 +2729,7 @@
"vqmovn.u16 d22, q11 \n"
"vqmovn.u16 d24, q12 \n"
"vqmovn.u16 d26, q13 \n"
+ MEMACCESS(1)
"vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -2485,6 +2750,7 @@
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
@@ -2493,6 +2759,7 @@
"vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
"vmov d1, d0 \n" // G
"vmov d2, d0 \n" // R
+ MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -2520,6 +2787,7 @@
"vmov.u8 d30, #50 \n" // BR coefficient
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
"subs %1, %1, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d20 \n" // B to Sepia B
@@ -2534,6 +2802,7 @@
"vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
"vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
"vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ MEMACCESS(0)
"vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(dst_argb), // %0
@@ -2550,12 +2819,14 @@
void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
const int8* matrix_argb, int width) {
asm volatile (
+ MEMACCESS(3)
"vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
"vmovl.s8 q0, d4 \n" // B,G coefficients s16.
"vmovl.s8 q1, d5 \n" // R,A coefficients s16.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
@@ -2594,6 +2865,7 @@
"vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
"vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
"vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
+ MEMACCESS(1)
"vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -2614,7 +2886,9 @@
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
"vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q0, d0, d1 \n" // multiply B
@@ -2625,6 +2899,7 @@
"vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
"vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
"vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
@@ -2645,11 +2920,14 @@
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 q0, q0, q2 \n" // add B, G
"vqadd.u8 q1, q1, q3 \n" // add R, A
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
@@ -2669,11 +2947,14 @@
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqsub.u8 q0, q0, q2 \n" // subtract B, G
"vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
@@ -2698,12 +2979,15 @@
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ MEMACCESS(1)
"vld1.8 {d1}, [%1]! \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d0, d0, d1 \n" // add
"vmov.u8 d1, d0 \n"
"vmov.u8 d2, d0 \n"
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_sobelx), // %0
@@ -2722,10 +3006,13 @@
// 16 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load 16 sobely.
"subs %3, %3, #16 \n" // 16 processed per loop.
"vqadd.u8 q0, q0, q1 \n" // add
+ MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" // store 16 pixels.
"bgt 1b \n"
: "+r"(src_sobelx), // %0
@@ -2749,10 +3036,13 @@
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ MEMACCESS(1)
"vld1.8 {d0}, [%1]! \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d1, d0, d2 \n" // add
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_sobelx), // %0
@@ -2773,21 +3063,28 @@
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d0}, [%0],%5 \n" // top
+ MEMACCESS(0)
"vld1.8 {d1}, [%0],%6 \n"
"vsubl.u8 q0, d0, d1 \n"
+ MEMACCESS(1)
"vld1.8 {d2}, [%1],%5 \n" // center * 2
+ MEMACCESS(1)
"vld1.8 {d3}, [%1],%6 \n"
"vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n"
"vadd.s16 q0, q0, q1 \n"
+ MEMACCESS(2)
"vld1.8 {d2}, [%2],%5 \n" // bottom
+ MEMACCESS(2)
"vld1.8 {d3}, [%2],%6 \n"
"subs %4, %4, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n"
"vabs.s16 q0, q0 \n"
"vqmovn.u16 d0, q0 \n"
+ MEMACCESS(3)
"vst1.8 {d0}, [%3]! \n" // store 8 sobelx
"bgt 1b \n"
: "+r"(src_y0), // %0
@@ -2810,21 +3107,28 @@
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d0}, [%0],%4 \n" // left
+ MEMACCESS(1)
"vld1.8 {d1}, [%1],%4 \n"
"vsubl.u8 q0, d0, d1 \n"
+ MEMACCESS(0)
"vld1.8 {d2}, [%0],%4 \n" // center * 2
+ MEMACCESS(1)
"vld1.8 {d3}, [%1],%4 \n"
"vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n"
"vadd.s16 q0, q0, q1 \n"
+ MEMACCESS(0)
"vld1.8 {d2}, [%0],%5 \n" // right
+ MEMACCESS(1)
"vld1.8 {d3}, [%1],%5 \n"
"subs %3, %3, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n"
"vabs.s16 q0, q0 \n"
"vqmovn.u16 d0, q0 \n"
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 sobely
"bgt 1b \n"
: "+r"(src_y0), // %0
« no previous file with comments | « source/libvpx/third_party/libyuv/source/row_mips.cc ('k') | source/libvpx/third_party/libyuv/source/row_neon64.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698