| Index: source/libvpx/third_party/libyuv/source/row_neon64.cc
|
| ===================================================================
|
| --- source/libvpx/third_party/libyuv/source/row_neon64.cc (revision 291857)
|
| +++ source/libvpx/third_party/libyuv/source/row_neon64.cc (working copy)
|
| @@ -824,19 +824,19 @@
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
|
| + "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV
|
| "subs %3, %3, #16 \n" // 16 processed per loop
|
| MEMACCESS(1)
|
| - "vst1.8 {q0}, [%1]! \n" // store U
|
| + "st1 {v0.16b}, [%1], #16 \n" // store U
|
| MEMACCESS(2)
|
| - "vst1.8 {q1}, [%2]! \n" // store V
|
| + "st1 {v1.16b}, [%2], #16 \n" // store V
|
| "bgt 1b \n"
|
| : "+r"(src_uv), // %0
|
| "+r"(dst_u), // %1
|
| "+r"(dst_v), // %2
|
| "+r"(width) // %3 // Output registers
|
| : // Input registers
|
| - : "cc", "memory", "q0", "q1" // Clobber List
|
| + : "cc", "memory", "v0", "v1" // Clobber List
|
| );
|
| }
|
| #endif // HAS_SPLITUVROW_NEON
|
| @@ -849,12 +849,12 @@
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld1.8 {q0}, [%0]! \n" // load U
|
| + "ld1 {v0.16b}, [%0], #16 \n" // load U
|
| MEMACCESS(1)
|
| - "vld1.8 {q1}, [%1]! \n" // load V
|
| + "ld1 {v1.16b}, [%1], #16 \n" // load V
|
| "subs %3, %3, #16 \n" // 16 processed per loop
|
| MEMACCESS(2)
|
| - "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
|
| + "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV
|
| "bgt 1b \n"
|
| :
|
| "+r"(src_u), // %0
|
| @@ -862,7 +862,7 @@
|
| "+r"(dst_uv), // %2
|
| "+r"(width) // %3 // Output registers
|
| : // Input registers
|
| - : "cc", "memory", "q0", "q1" // Clobber List
|
| + : "cc", "memory", "v0", "v1" // Clobber List
|
| );
|
| }
|
| #endif // HAS_MERGEUVROW_NEON
|
| @@ -874,16 +874,16 @@
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
|
| + "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32
|
| "subs %2, %2, #32 \n" // 32 processed per loop
|
| MEMACCESS(1)
|
| - "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
|
| + "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32
|
| "bgt 1b \n"
|
| : "+r"(src), // %0
|
| "+r"(dst), // %1
|
| "+r"(count) // %2 // Output registers
|
| : // Input registers
|
| - : "cc", "memory", "q0", "q1" // Clobber List
|
| + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
|
| );
|
| }
|
| #endif // HAS_COPYROW_NEON
|
| @@ -892,16 +892,16 @@
|
| #ifdef HAS_SETROW_NEON
|
| void SetRow_NEON(uint8* dst, uint32 v32, int count) {
|
| asm volatile (
|
| - "vdup.u32 q0, %2 \n" // duplicate 4 ints
|
| + "dup v0.4s, %w2 \n" // duplicate 4 ints
|
| "1: \n"
|
| "subs %1, %1, #16 \n" // 16 bytes per loop
|
| MEMACCESS(0)
|
| - "vst1.8 {q0}, [%0]! \n" // store
|
| + "st1 {v0.16b}, [%0], #16 \n" // store
|
| "bgt 1b \n"
|
| : "+r"(dst), // %0
|
| "+r"(count) // %1
|
| : "r"(v32) // %2
|
| - : "cc", "memory", "q0"
|
| + : "cc", "memory", "v0"
|
| );
|
| }
|
| #endif // HAS_SETROW_NEON
|
| @@ -922,26 +922,25 @@
|
| void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
|
| asm volatile (
|
| // Start at end of source row.
|
| - "mov r3, #-16 \n"
|
| "add %0, %0, %2 \n"
|
| - "sub %0, #16 \n"
|
| + "sub %0, %0, #16 \n"
|
|
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld1.8 {q0}, [%0], r3 \n" // src -= 16
|
| - "subs %2, #16 \n" // 16 pixels per loop.
|
| - "vrev64.8 q0, q0 \n"
|
| + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
|
| + "subs %2, %2, #16 \n" // 16 pixels per loop.
|
| + "rev64 v0.16b, v0.16b \n"
|
| MEMACCESS(1)
|
| - "vst1.8 {d1}, [%1]! \n" // dst += 16
|
| + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
|
| MEMACCESS(1)
|
| - "vst1.8 {d0}, [%1]! \n"
|
| + "st1 {v0.D}[0], [%1], #8 \n"
|
| "bgt 1b \n"
|
| : "+r"(src), // %0
|
| "+r"(dst), // %1
|
| "+r"(width) // %2
|
| - :
|
| - : "cc", "memory", "r3", "q0"
|
| + : "r"((ptrdiff_t)-16) // %3
|
| + : "cc", "memory", "v0"
|
| );
|
| }
|
| #endif // HAS_MIRRORROW_NEON
|
| @@ -951,27 +950,27 @@
|
| int width) {
|
| asm volatile (
|
| // Start at end of source row.
|
| - "mov r12, #-16 \n"
|
| "add %0, %0, %3, lsl #1 \n"
|
| - "sub %0, #16 \n"
|
| + "sub %0, %0, #16 \n"
|
|
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
|
| - "subs %3, #8 \n" // 8 pixels per loop.
|
| - "vrev64.8 q0, q0 \n"
|
| + "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
|
| + "subs %3, %3, #8 \n" // 8 pixels per loop.
|
| + "rev64 v0.8b, v0.8b \n"
|
| + "rev64 v1.8b, v1.8b \n"
|
| MEMACCESS(1)
|
| - "vst1.8 {d0}, [%1]! \n" // dst += 8
|
| + "st1 {v0.8b}, [%1], #8 \n" // dst += 8
|
| MEMACCESS(2)
|
| - "vst1.8 {d1}, [%2]! \n"
|
| + "st1 {v1.8b}, [%2], #8 \n"
|
| "bgt 1b \n"
|
| : "+r"(src_uv), // %0
|
| "+r"(dst_u), // %1
|
| "+r"(dst_v), // %2
|
| "+r"(width) // %3
|
| - :
|
| - : "cc", "memory", "r12", "q0"
|
| + : "r"((ptrdiff_t)-16) // %4
|
| + : "cc", "memory", "v0", "v1"
|
| );
|
| }
|
| #endif // HAS_MIRRORUVROW_NEON
|
| @@ -980,26 +979,25 @@
|
| void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
|
| asm volatile (
|
| // Start at end of source row.
|
| - "mov r3, #-16 \n"
|
| "add %0, %0, %2, lsl #2 \n"
|
| - "sub %0, #16 \n"
|
| + "sub %0, %0, #16 \n"
|
|
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld1.8 {q0}, [%0], r3 \n" // src -= 16
|
| - "subs %2, #4 \n" // 4 pixels per loop.
|
| - "vrev64.32 q0, q0 \n"
|
| + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
|
| + "subs %2, %2, #4 \n" // 4 pixels per loop.
|
| + "rev64 v0.4s, v0.4s \n"
|
| MEMACCESS(1)
|
| - "vst1.8 {d1}, [%1]! \n" // dst += 16
|
| + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
|
| MEMACCESS(1)
|
| - "vst1.8 {d0}, [%1]! \n"
|
| + "st1 {v0.D}[0], [%1], #8 \n"
|
| "bgt 1b \n"
|
| : "+r"(src), // %0
|
| "+r"(dst), // %1
|
| "+r"(width) // %2
|
| - :
|
| - : "cc", "memory", "r3", "q0"
|
| + : "r"((ptrdiff_t)-16) // %3
|
| + : "cc", "memory", "v0"
|
| );
|
| }
|
| #endif // HAS_ARGBMIRRORROW_NEON
|
| @@ -1007,20 +1005,20 @@
|
| #ifdef HAS_RGB24TOARGBROW_NEON
|
| void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
|
| asm volatile (
|
| - "vmov.u8 d4, #255 \n" // Alpha
|
| + "movi v4.8b, #255 \n" // Alpha
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
|
| + "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
|
| "subs %2, %2, #8 \n" // 8 processed per loop.
|
| MEMACCESS(1)
|
| - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
|
| + "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
|
| "bgt 1b \n"
|
| : "+r"(src_rgb24), // %0
|
| "+r"(dst_argb), // %1
|
| "+r"(pix) // %2
|
| :
|
| - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
|
| + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
|
| );
|
| }
|
| #endif // HAS_RGB24TOARGBROW_NEON
|
| @@ -1028,21 +1026,22 @@
|
| #ifdef HAS_RAWTOARGBROW_NEON
|
| void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
|
| asm volatile (
|
| - "vmov.u8 d4, #255 \n" // Alpha
|
| + "movi v5.8b, #255 \n" // Alpha
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
|
| + "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b
|
| "subs %2, %2, #8 \n" // 8 processed per loop.
|
| - "vswp.u8 d1, d3 \n" // swap R, B
|
| + "mov v3.8b, v1.8b \n" // move g
|
| + "mov v4.8b, v0.8b \n" // move r
|
| MEMACCESS(1)
|
| - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
|
| + "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a
|
| "bgt 1b \n"
|
| : "+r"(src_raw), // %0
|
| "+r"(dst_argb), // %1
|
| "+r"(pix) // %2
|
| :
|
| - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
|
| + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
|
| );
|
| }
|
| #endif // HAS_RAWTOARGBROW_NEON
|
| @@ -1170,16 +1169,16 @@
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
|
| + "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
|
| "subs %2, %2, #8 \n" // 8 processed per loop.
|
| MEMACCESS(1)
|
| - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
|
| + "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
|
| "bgt 1b \n"
|
| : "+r"(src_argb), // %0
|
| "+r"(dst_rgb24), // %1
|
| "+r"(pix) // %2
|
| :
|
| - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
|
| + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
|
| );
|
| }
|
| #endif // HAS_ARGBTORGB24ROW_NEON
|
| @@ -1190,17 +1189,18 @@
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
|
| + "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a
|
| "subs %2, %2, #8 \n" // 8 processed per loop.
|
| - "vswp.u8 d1, d3 \n" // swap R, B
|
| + "mov v4.8b, v2.8b \n" // mov g
|
| + "mov v5.8b, v1.8b \n" // mov b
|
| MEMACCESS(1)
|
| - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
|
| + "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b
|
| "bgt 1b \n"
|
| : "+r"(src_argb), // %0
|
| "+r"(dst_raw), // %1
|
| "+r"(pix) // %2
|
| :
|
| - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
|
| + : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
|
| );
|
| }
|
| #endif // HAS_ARGBTORAWROW_NEON
|
| @@ -1211,16 +1211,16 @@
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
|
| + "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
|
| "subs %2, %2, #16 \n" // 16 processed per loop.
|
| MEMACCESS(1)
|
| - "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
|
| + "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
|
| "bgt 1b \n"
|
| : "+r"(src_yuy2), // %0
|
| "+r"(dst_y), // %1
|
| "+r"(pix) // %2
|
| :
|
| - : "cc", "memory", "q0", "q1" // Clobber List
|
| + : "cc", "memory", "v0", "v1" // Clobber List
|
| );
|
| }
|
| #endif // HAS_YUY2TOYROW_NEON
|
| @@ -1231,16 +1231,16 @@
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
|
| + "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
|
| "subs %2, %2, #16 \n" // 16 processed per loop.
|
| MEMACCESS(1)
|
| - "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
|
| + "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
|
| "bgt 1b \n"
|
| : "+r"(src_uyvy), // %0
|
| "+r"(dst_y), // %1
|
| "+r"(pix) // %2
|
| :
|
| - : "cc", "memory", "q0", "q1" // Clobber List
|
| + : "cc", "memory", "v0", "v1" // Clobber List
|
| );
|
| }
|
| #endif // HAS_UYVYTOYROW_NEON
|
| @@ -1252,19 +1252,19 @@
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
|
| + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
|
| "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
|
| MEMACCESS(1)
|
| - "vst1.8 {d1}, [%1]! \n" // store 8 U.
|
| + "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
|
| MEMACCESS(2)
|
| - "vst1.8 {d3}, [%2]! \n" // store 8 V.
|
| + "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
|
| "bgt 1b \n"
|
| : "+r"(src_yuy2), // %0
|
| "+r"(dst_u), // %1
|
| "+r"(dst_v), // %2
|
| "+r"(pix) // %3
|
| :
|
| - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
|
| + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
|
| );
|
| }
|
| #endif // HAS_YUY2TOUV422ROW_NEON
|
| @@ -1276,19 +1276,19 @@
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
|
| + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
|
| "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
|
| MEMACCESS(1)
|
| - "vst1.8 {d0}, [%1]! \n" // store 8 U.
|
| + "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
|
| MEMACCESS(2)
|
| - "vst1.8 {d2}, [%2]! \n" // store 8 V.
|
| + "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
|
| "bgt 1b \n"
|
| : "+r"(src_uyvy), // %0
|
| "+r"(dst_u), // %1
|
| "+r"(dst_v), // %2
|
| "+r"(pix) // %3
|
| :
|
| - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
|
| + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
|
| );
|
| }
|
| #endif // HAS_UYVYTOUV422ROW_NEON
|
| @@ -1297,20 +1297,20 @@
|
| void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
|
| uint8* dst_u, uint8* dst_v, int pix) {
|
| asm volatile (
|
| - "add %1, %0, %1 \n" // stride + src_yuy2
|
| + "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
|
| + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
|
| "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
|
| MEMACCESS(1)
|
| - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
|
| - "vrhadd.u8 d1, d1, d5 \n" // average rows of U
|
| - "vrhadd.u8 d3, d3, d7 \n" // average rows of V
|
| + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2.
|
| + "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
|
| + "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
|
| MEMACCESS(2)
|
| - "vst1.8 {d1}, [%2]! \n" // store 8 U.
|
| + "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
|
| MEMACCESS(3)
|
| - "vst1.8 {d3}, [%3]! \n" // store 8 V.
|
| + "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
|
| "bgt 1b \n"
|
| : "+r"(src_yuy2), // %0
|
| "+r"(stride_yuy2), // %1
|
| @@ -1318,7 +1318,7 @@
|
| "+r"(dst_v), // %3
|
| "+r"(pix) // %4
|
| :
|
| - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
|
| + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
|
| );
|
| }
|
| #endif // HAS_YUY2TOUVROW_NEON
|
| @@ -1327,20 +1327,20 @@
|
| void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
|
| uint8* dst_u, uint8* dst_v, int pix) {
|
| asm volatile (
|
| - "add %1, %0, %1 \n" // stride + src_uyvy
|
| + "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
|
| + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
|
| "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
|
| MEMACCESS(1)
|
| - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
|
| - "vrhadd.u8 d0, d0, d4 \n" // average rows of U
|
| - "vrhadd.u8 d2, d2, d6 \n" // average rows of V
|
| + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY.
|
| + "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
|
| + "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
|
| MEMACCESS(2)
|
| - "vst1.8 {d0}, [%2]! \n" // store 8 U.
|
| + "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
|
| MEMACCESS(3)
|
| - "vst1.8 {d2}, [%3]! \n" // store 8 V.
|
| + "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
|
| "bgt 1b \n"
|
| : "+r"(src_uyvy), // %0
|
| "+r"(stride_uyvy), // %1
|
| @@ -1348,7 +1348,7 @@
|
| "+r"(dst_v), // %3
|
| "+r"(pix) // %4
|
| :
|
| - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
|
| + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
|
| );
|
| }
|
| #endif // HAS_UYVYTOUVROW_NEON
|
| @@ -1358,23 +1358,23 @@
|
| uint8* dst_uv, int pix) {
|
| asm volatile (
|
| // change the stride to row 2 pointer
|
| - "add %1, %0 \n"
|
| + "add %x1, %x0, %w1, sxtw \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
|
| + "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels.
|
| "subs %3, %3, #16 \n" // 16 processed per loop
|
| MEMACCESS(1)
|
| - "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
|
| - "vrhadd.u8 q0, q1 \n" // average row 1 and 2
|
| + "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels.
|
| + "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2
|
| MEMACCESS(2)
|
| - "vst1.8 {q0}, [%2]! \n"
|
| + "st1 {v0.16b}, [%2], #16 \n"
|
| "bgt 1b \n"
|
| : "+r"(src_uv), // %0
|
| "+r"(src_uv_stride), // %1
|
| "+r"(dst_uv), // %2
|
| "+r"(pix) // %3
|
| :
|
| - : "cc", "memory", "q0", "q1" // Clobber List
|
| + : "cc", "memory", "v0", "v1" // Clobber List
|
| );
|
| }
|
| #endif // HAS_HALFROW_NEON
|
| @@ -1384,22 +1384,22 @@
|
| void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
|
| uint32 selector, int pix) {
|
| asm volatile (
|
| - "vmov.u32 d6[0], %3 \n" // selector
|
| + "mov v2.s[0], %w3 \n" // selector
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
|
| + "ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels.
|
| "subs %2, %2, #8 \n" // 8 processed per loop
|
| - "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
|
| - "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
|
| - "vtrn.u32 d4, d5 \n" // combine 8 pixels
|
| + "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels
|
| + "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels
|
| + "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels
|
| MEMACCESS(1)
|
| - "vst1.8 {d4}, [%1]! \n" // store 8.
|
| + "st1 {v4.8b}, [%1], #8 \n" // store 8.
|
| "bgt 1b \n"
|
| : "+r"(src_argb), // %0
|
| "+r"(dst_bayer), // %1
|
| "+r"(pix) // %2
|
| : "r"(selector) // %3
|
| - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
|
| + : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List
|
| );
|
| }
|
| #endif // HAS_ARGBTOBAYERROW_NEON
|
| @@ -1411,16 +1411,16 @@
|
| asm volatile (
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
|
| + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels.
|
| "subs %2, %2, #8 \n" // 8 processed per loop
|
| MEMACCESS(1)
|
| - "vst1.8 {d1}, [%1]! \n" // store 8 G's.
|
| + "st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
|
| "bgt 1b \n"
|
| : "+r"(src_argb), // %0
|
| "+r"(dst_bayer), // %1
|
| "+r"(pix) // %2
|
| :
|
| - : "cc", "memory", "q0", "q1" // Clobber List
|
| + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
|
| );
|
| }
|
| #endif // HAS_ARGBTOBAYERGGROW_NEON
|
| @@ -1431,21 +1431,20 @@
|
| const uint8* shuffler, int pix) {
|
| asm volatile (
|
| MEMACCESS(3)
|
| - "vld1.8 {q2}, [%3] \n" // shuffler
|
| + "ld1 {v2.16b}, [%3] \n" // shuffler
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
|
| + "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
|
| "subs %2, %2, #4 \n" // 4 processed per loop
|
| - "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
|
| - "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
|
| + "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
|
| MEMACCESS(1)
|
| - "vst1.8 {q1}, [%1]! \n" // store 4.
|
| + "st1 {v1.16b}, [%1], #16 \n" // store 4.
|
| "bgt 1b \n"
|
| : "+r"(src_argb), // %0
|
| "+r"(dst_argb), // %1
|
| "+r"(pix) // %2
|
| : "r"(shuffler) // %3
|
| - : "cc", "memory", "q0", "q1", "q2" // Clobber List
|
| + : "cc", "memory", "v0", "v1", "v2" // Clobber List
|
| );
|
| }
|
| #endif // HAS_ARGBSHUFFLEROW_NEON
|
| @@ -1459,14 +1458,15 @@
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
|
| + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
|
| + "mov v2.8b, v1.8b \n"
|
| MEMACCESS(1)
|
| - "vld1.8 {d1}, [%1]! \n" // load 8 Us
|
| + "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
|
| MEMACCESS(2)
|
| - "vld1.8 {d3}, [%2]! \n" // load 8 Vs
|
| + "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
|
| "subs %4, %4, #16 \n" // 16 pixels
|
| MEMACCESS(3)
|
| - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
|
| + "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels.
|
| "bgt 1b \n"
|
| : "+r"(src_y), // %0
|
| "+r"(src_u), // %1
|
| @@ -1474,7 +1474,7 @@
|
| "+r"(dst_yuy2), // %3
|
| "+r"(width) // %4
|
| :
|
| - : "cc", "memory", "d0", "d1", "d2", "d3"
|
| + : "cc", "memory", "v0", "v1", "v2", "v3"
|
| );
|
| }
|
| #endif // HAS_I422TOYUY2ROW_NEON
|
| @@ -1488,14 +1488,15 @@
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
|
| + "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys
|
| + "mov v3.8b, v2.8b \n"
|
| MEMACCESS(1)
|
| - "vld1.8 {d0}, [%1]! \n" // load 8 Us
|
| + "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
|
| MEMACCESS(2)
|
| - "vld1.8 {d2}, [%2]! \n" // load 8 Vs
|
| + "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
|
| "subs %4, %4, #16 \n" // 16 pixels
|
| MEMACCESS(3)
|
| - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
|
| + "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels.
|
| "bgt 1b \n"
|
| : "+r"(src_y), // %0
|
| "+r"(src_u), // %1
|
| @@ -1503,7 +1504,7 @@
|
| "+r"(dst_uyvy), // %3
|
| "+r"(width) // %4
|
| :
|
| - : "cc", "memory", "d0", "d1", "d2", "d3"
|
| + : "cc", "memory", "v0", "v1", "v2", "v3"
|
| );
|
| }
|
| #endif // HAS_I422TOUYVYROW_NEON
|
| @@ -1577,28 +1578,28 @@
|
| #ifdef HAS_ARGBTOYROW_NEON
|
| void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
|
| asm volatile (
|
| - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
|
| - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
|
| - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
|
| - "vmov.u8 d27, #16 \n" // Add 16 constant
|
| + "movi v4.8b, #13 \n" // B * 0.1016 coefficient
|
| + "movi v5.8b, #65 \n" // G * 0.5078 coefficient
|
| + "movi v6.8b, #33 \n" // R * 0.2578 coefficient
|
| + "movi v7.8b, #16 \n" // Add 16 constant
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
| + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
| "subs %2, %2, #8 \n" // 8 processed per loop.
|
| - "vmull.u8 q2, d0, d24 \n" // B
|
| - "vmlal.u8 q2, d1, d25 \n" // G
|
| - "vmlal.u8 q2, d2, d26 \n" // R
|
| - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
|
| - "vqadd.u8 d0, d27 \n"
|
| + "umull v3.8h, v0.8b, v4.8b \n" // B
|
| + "umlal v3.8h, v1.8b, v5.8b \n" // G
|
| + "umlal v3.8h, v2.8b, v6.8b \n" // R
|
| + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
|
| + "uqadd v0.8b, v0.8b, v7.8b \n"
|
| MEMACCESS(1)
|
| - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
|
| + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
| "bgt 1b \n"
|
| : "+r"(src_argb), // %0
|
| "+r"(dst_y), // %1
|
| "+r"(pix) // %2
|
| :
|
| - : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
|
| + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
| );
|
| }
|
| #endif // HAS_ARGBTOYROW_NEON
|
| @@ -1606,26 +1607,26 @@
|
| #ifdef HAS_ARGBTOYJROW_NEON
|
| void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
|
| asm volatile (
|
| - "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
|
| - "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
|
| - "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
|
| + "movi v4.8b, #15 \n" // B * 0.11400 coefficient
|
| + "movi v5.8b, #75 \n" // G * 0.58700 coefficient
|
| + "movi v6.8b, #38 \n" // R * 0.29900 coefficient
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
| + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
| "subs %2, %2, #8 \n" // 8 processed per loop.
|
| - "vmull.u8 q2, d0, d24 \n" // B
|
| - "vmlal.u8 q2, d1, d25 \n" // G
|
| - "vmlal.u8 q2, d2, d26 \n" // R
|
| - "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
|
| + "umull v3.8h, v0.8b, v4.8b \n" // B
|
| + "umlal v3.8h, v1.8b, v5.8b \n" // G
|
| + "umlal v3.8h, v2.8b, v6.8b \n" // R
|
| + "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
|
| MEMACCESS(1)
|
| - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
|
| + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
| "bgt 1b \n"
|
| : "+r"(src_argb), // %0
|
| "+r"(dst_y), // %1
|
| "+r"(pix) // %2
|
| :
|
| - : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
|
| + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
|
| );
|
| }
|
| #endif // HAS_ARGBTOYJROW_NEON
|
| @@ -3048,20 +3049,20 @@
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
| + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
| MEMACCESS(1)
|
| - "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels.
|
| + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
|
| "subs %3, %3, #8 \n" // 8 processed per loop.
|
| - "vmull.u8 q0, d0, d1 \n" // multiply B
|
| - "vmull.u8 q1, d2, d3 \n" // multiply G
|
| - "vmull.u8 q2, d4, d5 \n" // multiply R
|
| - "vmull.u8 q3, d6, d7 \n" // multiply A
|
| - "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
|
| - "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
|
| - "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
|
| - "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
|
| + "umull v0.8h, v0.8b, v4.8b \n" // multiply B
|
| + "umull v1.8h, v1.8b, v5.8b \n" // multiply G
|
| + "umull v2.8h, v2.8b, v6.8b \n" // multiply R
|
| + "umull v3.8h, v3.8b, v7.8b \n" // multiply A
|
| + "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
|
| + "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
|
| + "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
|
| + "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
|
| MEMACCESS(2)
|
| - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
|
| + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
|
| "bgt 1b \n"
|
|
|
| : "+r"(src_argb0), // %0
|
| @@ -3069,7 +3070,7 @@
|
| "+r"(dst_argb), // %2
|
| "+r"(width) // %3
|
| :
|
| - : "cc", "memory", "q0", "q1", "q2", "q3"
|
| + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
| );
|
| }
|
| #endif // HAS_ARGBMULTIPLYROW_NEON
|
| @@ -3083,14 +3084,16 @@
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
| + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
| MEMACCESS(1)
|
| - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
|
| + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
|
| "subs %3, %3, #8 \n" // 8 processed per loop.
|
| - "vqadd.u8 q0, q0, q2 \n" // add B, G
|
| - "vqadd.u8 q1, q1, q3 \n" // add R, A
|
| + "uqadd v0.8b, v0.8b, v4.8b \n"
|
| + "uqadd v1.8b, v1.8b, v5.8b \n"
|
| + "uqadd v2.8b, v2.8b, v6.8b \n"
|
| + "uqadd v3.8b, v3.8b, v7.8b \n"
|
| MEMACCESS(2)
|
| - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
|
| + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
|
| "bgt 1b \n"
|
|
|
| : "+r"(src_argb0), // %0
|
| @@ -3098,7 +3101,7 @@
|
| "+r"(dst_argb), // %2
|
| "+r"(width) // %3
|
| :
|
| - : "cc", "memory", "q0", "q1", "q2", "q3"
|
| + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
| );
|
| }
|
| #endif // HAS_ARGBADDROW_NEON
|
| @@ -3112,14 +3115,16 @@
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
| + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
| MEMACCESS(1)
|
| - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
|
| + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
|
| "subs %3, %3, #8 \n" // 8 processed per loop.
|
| - "vqsub.u8 q0, q0, q2 \n" // subtract B, G
|
| - "vqsub.u8 q1, q1, q3 \n" // subtract R, A
|
| + "uqsub v0.8b, v0.8b, v4.8b \n"
|
| + "uqsub v1.8b, v1.8b, v5.8b \n"
|
| + "uqsub v2.8b, v2.8b, v6.8b \n"
|
| + "uqsub v3.8b, v3.8b, v7.8b \n"
|
| MEMACCESS(2)
|
| - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
|
| + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
|
| "bgt 1b \n"
|
|
|
| : "+r"(src_argb0), // %0
|
| @@ -3127,7 +3132,7 @@
|
| "+r"(dst_argb), // %2
|
| "+r"(width) // %3
|
| :
|
| - : "cc", "memory", "q0", "q1", "q2", "q3"
|
| + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
| );
|
| }
|
| #endif // HAS_ARGBSUBTRACTROW_NEON
|
| @@ -3141,27 +3146,27 @@
|
| void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
|
| uint8* dst_argb, int width) {
|
| asm volatile (
|
| - "vmov.u8 d3, #255 \n" // alpha
|
| + "movi v3.8b, #255 \n" // alpha
|
| // 8 pixel loop.
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
|
| + "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
|
| MEMACCESS(1)
|
| - "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
|
| + "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
|
| "subs %3, %3, #8 \n" // 8 processed per loop.
|
| - "vqadd.u8 d0, d0, d1 \n" // add
|
| - "vmov.u8 d1, d0 \n"
|
| - "vmov.u8 d2, d0 \n"
|
| + "uqadd v0.8b, v0.8b, v1.8b \n" // add
|
| + "mov v1.8b, v0.8b \n"
|
| + "mov v2.8b, v0.8b \n"
|
| MEMACCESS(2)
|
| - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
|
| + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
|
| "bgt 1b \n"
|
| : "+r"(src_sobelx), // %0
|
| "+r"(src_sobely), // %1
|
| "+r"(dst_argb), // %2
|
| "+r"(width) // %3
|
| :
|
| - : "cc", "memory", "q0", "q1"
|
| + : "cc", "memory", "v0", "v1", "v2", "v3"
|
| );
|
| }
|
| #endif // HAS_SOBELROW_NEON
|
| @@ -3175,20 +3180,20 @@
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
|
| + "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
|
| MEMACCESS(1)
|
| - "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
|
| + "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
|
| "subs %3, %3, #16 \n" // 16 processed per loop.
|
| - "vqadd.u8 q0, q0, q1 \n" // add
|
| + "uqadd v0.16b, v0.16b, v1.16b \n" // add
|
| MEMACCESS(2)
|
| - "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
|
| + "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
|
| "bgt 1b \n"
|
| : "+r"(src_sobelx), // %0
|
| "+r"(src_sobely), // %1
|
| "+r"(dst_y), // %2
|
| "+r"(width) // %3
|
| :
|
| - : "cc", "memory", "q0", "q1"
|
| + : "cc", "memory", "v0", "v1"
|
| );
|
| }
|
| #endif // HAS_SOBELTOPLANEROW_NEON
|
| @@ -3202,25 +3207,25 @@
|
| void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
|
| uint8* dst_argb, int width) {
|
| asm volatile (
|
| - "vmov.u8 d3, #255 \n" // alpha
|
| + "movi v3.8b, #255 \n" // alpha
|
| // 8 pixel loop.
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
|
| + "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
|
| MEMACCESS(1)
|
| - "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
|
| + "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
|
| "subs %3, %3, #8 \n" // 8 processed per loop.
|
| - "vqadd.u8 d1, d0, d2 \n" // add
|
| + "uqadd v1.8b, v0.8b, v2.8b \n" // add
|
| MEMACCESS(2)
|
| - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
|
| + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
|
| "bgt 1b \n"
|
| : "+r"(src_sobelx), // %0
|
| "+r"(src_sobely), // %1
|
| "+r"(dst_argb), // %2
|
| "+r"(width) // %3
|
| :
|
| - : "cc", "memory", "q0", "q1"
|
| + : "cc", "memory", "v0", "v1", "v2", "v3"
|
| );
|
| }
|
| #endif // HAS_SOBELXYROW_NEON
|
| @@ -3236,28 +3241,28 @@
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld1.8 {d0}, [%0],%5 \n" // top
|
| + "ld1 {v0.8b}, [%0],%5 \n" // top
|
| MEMACCESS(0)
|
| - "vld1.8 {d1}, [%0],%6 \n"
|
| - "vsubl.u8 q0, d0, d1 \n"
|
| + "ld1 {v1.8b}, [%0],%6 \n"
|
| + "usubl v0.8h, v0.8b, v1.8b \n"
|
| MEMACCESS(1)
|
| - "vld1.8 {d2}, [%1],%5 \n" // center * 2
|
| + "ld1 {v2.8b}, [%1],%5 \n" // center * 2
|
| MEMACCESS(1)
|
| - "vld1.8 {d3}, [%1],%6 \n"
|
| - "vsubl.u8 q1, d2, d3 \n"
|
| - "vadd.s16 q0, q0, q1 \n"
|
| - "vadd.s16 q0, q0, q1 \n"
|
| + "ld1 {v3.8b}, [%1],%6 \n"
|
| + "usubl v1.8h, v2.8b, v3.8b \n"
|
| + "add v0.8h, v0.8h, v1.8h \n"
|
| + "add v0.8h, v0.8h, v1.8h \n"
|
| MEMACCESS(2)
|
| - "vld1.8 {d2}, [%2],%5 \n" // bottom
|
| + "ld1 {v2.8b}, [%2],%5 \n" // bottom
|
| MEMACCESS(2)
|
| - "vld1.8 {d3}, [%2],%6 \n"
|
| + "ld1 {v3.8b}, [%2],%6 \n"
|
| "subs %4, %4, #8 \n" // 8 pixels
|
| - "vsubl.u8 q1, d2, d3 \n"
|
| - "vadd.s16 q0, q0, q1 \n"
|
| - "vabs.s16 q0, q0 \n"
|
| - "vqmovn.u16 d0, q0 \n"
|
| + "usubl v1.8h, v2.8b, v3.8b \n"
|
| + "add v0.8h, v0.8h, v1.8h \n"
|
| + "abs v0.8h, v0.8h \n"
|
| + "uqxtn v0.8b, v0.8h \n"
|
| MEMACCESS(3)
|
| - "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
|
| + "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
|
| "bgt 1b \n"
|
| : "+r"(src_y0), // %0
|
| "+r"(src_y1), // %1
|
| @@ -3266,7 +3271,7 @@
|
| "+r"(width) // %4
|
| : "r"(2), // %5
|
| "r"(6) // %6
|
| - : "cc", "memory", "q0", "q1" // Clobber List
|
| + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
|
| );
|
| }
|
| #endif // HAS_SOBELXROW_NEON
|
| @@ -3282,28 +3287,28 @@
|
| ".p2align 2 \n"
|
| "1: \n"
|
| MEMACCESS(0)
|
| - "vld1.8 {d0}, [%0],%4 \n" // left
|
| + "ld1 {v0.8b}, [%0],%4 \n" // left
|
| MEMACCESS(1)
|
| - "vld1.8 {d1}, [%1],%4 \n"
|
| - "vsubl.u8 q0, d0, d1 \n"
|
| + "ld1 {v1.8b}, [%1],%4 \n"
|
| + "usubl v0.8h, v0.8b, v1.8b \n"
|
| MEMACCESS(0)
|
| - "vld1.8 {d2}, [%0],%4 \n" // center * 2
|
| + "ld1 {v2.8b}, [%0],%4 \n" // center * 2
|
| MEMACCESS(1)
|
| - "vld1.8 {d3}, [%1],%4 \n"
|
| - "vsubl.u8 q1, d2, d3 \n"
|
| - "vadd.s16 q0, q0, q1 \n"
|
| - "vadd.s16 q0, q0, q1 \n"
|
| + "ld1 {v3.8b}, [%1],%4 \n"
|
| + "usubl v1.8h, v2.8b, v3.8b \n"
|
| + "add v0.8h, v0.8h, v1.8h \n"
|
| + "add v0.8h, v0.8h, v1.8h \n"
|
| MEMACCESS(0)
|
| - "vld1.8 {d2}, [%0],%5 \n" // right
|
| + "ld1 {v2.8b}, [%0],%5 \n" // right
|
| MEMACCESS(1)
|
| - "vld1.8 {d3}, [%1],%5 \n"
|
| + "ld1 {v3.8b}, [%1],%5 \n"
|
| "subs %3, %3, #8 \n" // 8 pixels
|
| - "vsubl.u8 q1, d2, d3 \n"
|
| - "vadd.s16 q0, q0, q1 \n"
|
| - "vabs.s16 q0, q0 \n"
|
| - "vqmovn.u16 d0, q0 \n"
|
| + "usubl v1.8h, v2.8b, v3.8b \n"
|
| + "add v0.8h, v0.8h, v1.8h \n"
|
| + "abs v0.8h, v0.8h \n"
|
| + "uqxtn v0.8b, v0.8h \n"
|
| MEMACCESS(2)
|
| - "vst1.8 {d0}, [%2]! \n" // store 8 sobely
|
| + "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
|
| "bgt 1b \n"
|
| : "+r"(src_y0), // %0
|
| "+r"(src_y1), // %1
|
| @@ -3311,7 +3316,7 @@
|
| "+r"(width) // %3
|
| : "r"(1), // %4
|
| "r"(6) // %5
|
| - : "cc", "memory", "q0", "q1" // Clobber List
|
| + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
|
| );
|
| }
|
| #endif // HAS_SOBELYROW_NEON
|
|
|