Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(20)

Unified Diff: source/libvpx/third_party/libyuv/source/row_neon64.cc

Issue 554673004: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: source/libvpx/third_party/libyuv/source/row_neon64.cc
===================================================================
--- source/libvpx/third_party/libyuv/source/row_neon64.cc (revision 291857)
+++ source/libvpx/third_party/libyuv/source/row_neon64.cc (working copy)
@@ -824,19 +824,19 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
+ "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store U
+ "st1 {v0.16b}, [%1], #16 \n" // store U
MEMACCESS(2)
- "vst1.8 {q1}, [%2]! \n" // store V
+ "st1 {v1.16b}, [%2], #16 \n" // store V
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3 // Output registers
: // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1" // Clobber List
);
}
#endif // HAS_SPLITUVROW_NEON
@@ -849,12 +849,12 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load U
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n" // load V
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(2)
- "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
+ "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV
"bgt 1b \n"
:
"+r"(src_u), // %0
@@ -862,7 +862,7 @@
"+r"(dst_uv), // %2
"+r"(width) // %3 // Output registers
: // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1" // Clobber List
);
}
#endif // HAS_MERGEUVROW_NEON
@@ -874,16 +874,16 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
+ "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop
MEMACCESS(1)
- "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
+ "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(count) // %2 // Output registers
: // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_COPYROW_NEON
@@ -892,16 +892,16 @@
#ifdef HAS_SETROW_NEON
void SetRow_NEON(uint8* dst, uint32 v32, int count) {
asm volatile (
- "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n" // store
+ "st1 {v0.16b}, [%0], #16 \n" // store
"bgt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v32) // %2
- : "cc", "memory", "q0"
+ : "cc", "memory", "v0"
);
}
#endif // HAS_SETROW_NEON
@@ -922,26 +922,25 @@
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
// Start at end of source row.
- "mov r3, #-16 \n"
"add %0, %0, %2 \n"
- "sub %0, #16 \n"
+ "sub %0, %0, #16 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #16 \n" // 16 pixels per loop.
- "vrev64.8 q0, q0 \n"
+ "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
+ "subs %2, %2, #16 \n" // 16 pixels per loop.
+ "rev64 v0.16b, v0.16b \n"
MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // dst += 16
+ "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n"
+ "st1 {v0.D}[0], [%1], #8 \n"
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0"
+ : "r"((ptrdiff_t)-16) // %3
+ : "cc", "memory", "v0"
);
}
#endif // HAS_MIRRORROW_NEON
@@ -951,27 +950,27 @@
int width) {
asm volatile (
// Start at end of source row.
- "mov r12, #-16 \n"
"add %0, %0, %3, lsl #1 \n"
- "sub %0, #16 \n"
+ "sub %0, %0, #16 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
- "subs %3, #8 \n" // 8 pixels per loop.
- "vrev64.8 q0, q0 \n"
+ "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
+ "subs %3, %3, #8 \n" // 8 pixels per loop.
+ "rev64 v0.8b, v0.8b \n"
+ "rev64 v1.8b, v1.8b \n"
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // dst += 8
+ "st1 {v0.8b}, [%1], #8 \n" // dst += 8
MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n"
+ "st1 {v1.8b}, [%2], #8 \n"
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
- :
- : "cc", "memory", "r12", "q0"
+ : "r"((ptrdiff_t)-16) // %4
+ : "cc", "memory", "v0", "v1"
);
}
#endif // HAS_MIRRORUVROW_NEON
@@ -980,26 +979,25 @@
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
// Start at end of source row.
- "mov r3, #-16 \n"
"add %0, %0, %2, lsl #2 \n"
- "sub %0, #16 \n"
+ "sub %0, %0, #16 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #4 \n" // 4 pixels per loop.
- "vrev64.32 q0, q0 \n"
+ "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ "rev64 v0.4s, v0.4s \n"
MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // dst += 16
+ "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n"
+ "st1 {v0.D}[0], [%1], #8 \n"
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0"
+ : "r"((ptrdiff_t)-16) // %3
+ : "cc", "memory", "v0"
);
}
#endif // HAS_ARGBMIRRORROW_NEON
@@ -1007,20 +1005,20 @@
#ifdef HAS_RGB24TOARGBROW_NEON
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile (
- "vmov.u8 d4, #255 \n" // Alpha
+ "movi v4.8b, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+ "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1)
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
);
}
#endif // HAS_RGB24TOARGBROW_NEON
@@ -1028,21 +1026,22 @@
#ifdef HAS_RAWTOARGBROW_NEON
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile (
- "vmov.u8 d4, #255 \n" // Alpha
+ "movi v5.8b, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
+ "mov v3.8b, v1.8b \n" // move g
+ "mov v4.8b, v0.8b \n" // move r
MEMACCESS(1)
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a
"bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
);
}
#endif // HAS_RAWTOARGBROW_NEON
@@ -1170,16 +1169,16 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1)
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
+ "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
);
}
#endif // HAS_ARGBTORGB24ROW_NEON
@@ -1190,17 +1189,18 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
+ "mov v4.8b, v2.8b \n" // mov g
+ "mov v5.8b, v1.8b \n" // mov b
MEMACCESS(1)
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+ "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
);
}
#endif // HAS_ARGBTORAWROW_NEON
@@ -1211,16 +1211,16 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1" // Clobber List
);
}
#endif // HAS_YUY2TOYROW_NEON
@@ -1231,16 +1231,16 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1" // Clobber List
);
}
#endif // HAS_UYVYTOYROW_NEON
@@ -1252,19 +1252,19 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // store 8 U.
+ "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2)
- "vst1.8 {d3}, [%2]! \n" // store 8 V.
+ "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_YUY2TOUV422ROW_NEON
@@ -1276,19 +1276,19 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 U.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2)
- "vst1.8 {d2}, [%2]! \n" // store 8 V.
+ "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_UYVYTOUV422ROW_NEON
@@ -1297,20 +1297,20 @@
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
- "add %1, %0, %1 \n" // stride + src_yuy2
+ "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
- "vrhadd.u8 d1, d1, d5 \n" // average rows of U
- "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2.
+ "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
+ "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n" // store 8 U.
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
MEMACCESS(3)
- "vst1.8 {d3}, [%3]! \n" // store 8 V.
+ "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(stride_yuy2), // %1
@@ -1318,7 +1318,7 @@
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
);
}
#endif // HAS_YUY2TOUVROW_NEON
@@ -1327,20 +1327,20 @@
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
- "add %1, %0, %1 \n" // stride + src_uyvy
+ "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
- "vrhadd.u8 d0, d0, d4 \n" // average rows of U
- "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY.
+ "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
+ "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
MEMACCESS(3)
- "vst1.8 {d2}, [%3]! \n" // store 8 V.
+ "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(stride_uyvy), // %1
@@ -1348,7 +1348,7 @@
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
);
}
#endif // HAS_UYVYTOUVROW_NEON
@@ -1358,23 +1358,23 @@
uint8* dst_uv, int pix) {
asm volatile (
// change the stride to row 2 pointer
- "add %1, %0 \n"
+ "add %x1, %x0, %w1, sxtw \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels.
"subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
- "vrhadd.u8 q0, q1 \n" // average row 1 and 2
+ "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels.
+ "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2
MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n"
+ "st1 {v0.16b}, [%2], #16 \n"
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(src_uv_stride), // %1
"+r"(dst_uv), // %2
"+r"(pix) // %3
:
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1" // Clobber List
);
}
#endif // HAS_HALFROW_NEON
@@ -1384,22 +1384,22 @@
void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) {
asm volatile (
- "vmov.u32 d6[0], %3 \n" // selector
+ "mov v2.s[0], %w3 \n" // selector
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
+ "ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
- "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
- "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
- "vtrn.u32 d4, d5 \n" // combine 8 pixels
+ "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels
+ "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels
+ "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels
MEMACCESS(1)
- "vst1.8 {d4}, [%1]! \n" // store 8.
+ "st1 {v4.8b}, [%1], #8 \n" // store 8.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
: "r"(selector) // %3
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List
);
}
#endif // HAS_ARGBTOBAYERROW_NEON
@@ -1411,16 +1411,16 @@
asm volatile (
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // store 8 G's.
+ "st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_ARGBTOBAYERGGROW_NEON
@@ -1431,21 +1431,20 @@
const uint8* shuffler, int pix) {
asm volatile (
MEMACCESS(3)
- "vld1.8 {q2}, [%3] \n" // shuffler
+ "ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop
- "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
- "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n" // store 4.
+ "st1 {v1.16b}, [%1], #16 \n" // store 4.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(shuffler) // %3
- : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
#endif // HAS_ARGBSHUFFLEROW_NEON
@@ -1459,14 +1458,15 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
+ "mov v2.8b, v1.8b \n"
MEMACCESS(1)
- "vld1.8 {d1}, [%1]! \n" // load 8 Us
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2)
- "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+ "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3)
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
+ "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels.
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@@ -1474,7 +1474,7 @@
"+r"(dst_yuy2), // %3
"+r"(width) // %4
:
- : "cc", "memory", "d0", "d1", "d2", "d3"
+ : "cc", "memory", "v0", "v1", "v2", "v3"
);
}
#endif // HAS_I422TOYUY2ROW_NEON
@@ -1488,14 +1488,15 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys
+ "mov v3.8b, v2.8b \n"
MEMACCESS(1)
- "vld1.8 {d0}, [%1]! \n" // load 8 Us
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2)
- "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+ "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3)
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
+ "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels.
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@@ -1503,7 +1504,7 @@
"+r"(dst_uyvy), // %3
"+r"(width) // %4
:
- : "cc", "memory", "d0", "d1", "d2", "d3"
+ : "cc", "memory", "v0", "v1", "v2", "v3"
);
}
#endif // HAS_I422TOUYVYROW_NEON
@@ -1577,28 +1578,28 @@
#ifdef HAS_ARGBTOYROW_NEON
void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
#endif // HAS_ARGBTOYROW_NEON
@@ -1606,26 +1607,26 @@
#ifdef HAS_ARGBTOYJROW_NEON
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ "movi v4.8b, #15 \n" // B * 0.11400 coefficient
+ "movi v5.8b, #75 \n" // G * 0.58700 coefficient
+ "movi v6.8b, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
);
}
#endif // HAS_ARGBTOYJROW_NEON
@@ -3048,20 +3049,20 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
- "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q0, d0, d1 \n" // multiply B
- "vmull.u8 q1, d2, d3 \n" // multiply G
- "vmull.u8 q2, d4, d5 \n" // multiply R
- "vmull.u8 q3, d6, d7 \n" // multiply A
- "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
- "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
- "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
- "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B
+ "umull v1.8h, v1.8b, v5.8b \n" // multiply G
+ "umull v2.8h, v2.8b, v6.8b \n" // multiply R
+ "umull v3.8h, v3.8b, v7.8b \n" // multiply A
+ "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
+ "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
+ "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
+ "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
@@ -3069,7 +3070,7 @@
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1", "q2", "q3"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
#endif // HAS_ARGBMULTIPLYROW_NEON
@@ -3083,14 +3084,16 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 q0, q0, q2 \n" // add B, G
- "vqadd.u8 q1, q1, q3 \n" // add R, A
+ "uqadd v0.8b, v0.8b, v4.8b \n"
+ "uqadd v1.8b, v1.8b, v5.8b \n"
+ "uqadd v2.8b, v2.8b, v6.8b \n"
+ "uqadd v3.8b, v3.8b, v7.8b \n"
MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
@@ -3098,7 +3101,7 @@
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1", "q2", "q3"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
#endif // HAS_ARGBADDROW_NEON
@@ -3112,14 +3115,16 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vqsub.u8 q0, q0, q2 \n" // subtract B, G
- "vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ "uqsub v0.8b, v0.8b, v4.8b \n"
+ "uqsub v1.8b, v1.8b, v5.8b \n"
+ "uqsub v2.8b, v2.8b, v6.8b \n"
+ "uqsub v3.8b, v3.8b, v7.8b \n"
MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
@@ -3127,7 +3132,7 @@
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1", "q2", "q3"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
#endif // HAS_ARGBSUBTRACTROW_NEON
@@ -3141,27 +3146,27 @@
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) {
asm volatile (
- "vmov.u8 d3, #255 \n" // alpha
+ "movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
MEMACCESS(1)
- "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d0, d0, d1 \n" // add
- "vmov.u8 d1, d0 \n"
- "vmov.u8 d2, d0 \n"
+ "uqadd v0.8b, v0.8b, v1.8b \n" // add
+ "mov v1.8b, v0.8b \n"
+ "mov v2.8b, v0.8b \n"
MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1"
+ : "cc", "memory", "v0", "v1", "v2", "v3"
);
}
#endif // HAS_SOBELROW_NEON
@@ -3175,20 +3180,20 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
+ "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
"subs %3, %3, #16 \n" // 16 processed per loop.
- "vqadd.u8 q0, q0, q1 \n" // add
+ "uqadd v0.16b, v0.16b, v1.16b \n" // add
MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
+ "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
"bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1"
+ : "cc", "memory", "v0", "v1"
);
}
#endif // HAS_SOBELTOPLANEROW_NEON
@@ -3202,25 +3207,25 @@
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) {
asm volatile (
- "vmov.u8 d3, #255 \n" // alpha
+ "movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
MEMACCESS(1)
- "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d1, d0, d2 \n" // add
+ "uqadd v1.8b, v0.8b, v2.8b \n" // add
MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1"
+ : "cc", "memory", "v0", "v1", "v2", "v3"
);
}
#endif // HAS_SOBELXYROW_NEON
@@ -3236,28 +3241,28 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d0}, [%0],%5 \n" // top
+ "ld1 {v0.8b}, [%0],%5 \n" // top
MEMACCESS(0)
- "vld1.8 {d1}, [%0],%6 \n"
- "vsubl.u8 q0, d0, d1 \n"
+ "ld1 {v1.8b}, [%0],%6 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
MEMACCESS(1)
- "vld1.8 {d2}, [%1],%5 \n" // center * 2
+ "ld1 {v2.8b}, [%1],%5 \n" // center * 2
MEMACCESS(1)
- "vld1.8 {d3}, [%1],%6 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
+ "ld1 {v3.8b}, [%1],%6 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
MEMACCESS(2)
- "vld1.8 {d2}, [%2],%5 \n" // bottom
+ "ld1 {v2.8b}, [%2],%5 \n" // bottom
MEMACCESS(2)
- "vld1.8 {d3}, [%2],%6 \n"
+ "ld1 {v3.8b}, [%2],%6 \n"
"subs %4, %4, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
MEMACCESS(3)
- "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
+ "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
"bgt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
@@ -3266,7 +3271,7 @@
"+r"(width) // %4
: "r"(2), // %5
"r"(6) // %6
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_SOBELXROW_NEON
@@ -3282,28 +3287,28 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d0}, [%0],%4 \n" // left
+ "ld1 {v0.8b}, [%0],%4 \n" // left
MEMACCESS(1)
- "vld1.8 {d1}, [%1],%4 \n"
- "vsubl.u8 q0, d0, d1 \n"
+ "ld1 {v1.8b}, [%1],%4 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
MEMACCESS(0)
- "vld1.8 {d2}, [%0],%4 \n" // center * 2
+ "ld1 {v2.8b}, [%0],%4 \n" // center * 2
MEMACCESS(1)
- "vld1.8 {d3}, [%1],%4 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
+ "ld1 {v3.8b}, [%1],%4 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
MEMACCESS(0)
- "vld1.8 {d2}, [%0],%5 \n" // right
+ "ld1 {v2.8b}, [%0],%5 \n" // right
MEMACCESS(1)
- "vld1.8 {d3}, [%1],%5 \n"
+ "ld1 {v3.8b}, [%1],%5 \n"
"subs %3, %3, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 sobely
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
"bgt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
@@ -3311,7 +3316,7 @@
"+r"(width) // %3
: "r"(1), // %4
"r"(6) // %5
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_SOBELYROW_NEON
« no previous file with comments | « source/libvpx/third_party/libyuv/source/row_any.cc ('k') | source/libvpx/third_party/libyuv/source/row_win.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698