source/row_neon.cc - Issue 2478313004: HalfFloat neon armv7 fix for destination pointer.

Side by Side Diff: source/row_neon.cc

Issue 2478313004: HalfFloat neon armv7 fix for destination pointer. (Closed)

Patch Set: added caveat on rounding difference — Created 4 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.	2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 2724 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
2735 }	2735 }

2736	2736

2737 void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {	2737 void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {

2738 asm volatile (	2738 asm volatile (

2739 "vdup.32 q0, %3 \n"	2739 "vdup.32 q0, %3 \n"

2740	2740

2741 "1: \n"	2741 "1: \n"

2742 MEMACCESS(0)	2742 MEMACCESS(0)

2743 "vld1.8 {q1}, [%0]! \n" // load 8 shorts	2743 "vld1.8 {q1}, [%0]! \n" // load 8 shorts

2744 "subs %2, %2, #8 \n" // 8 pixels per loop	2744 "subs %2, %2, #8 \n" // 8 pixels per loop

2745 "vmovl.u8 q2, d2 \n" // 8 int's	2745 "vmovl.u16 q2, d2 \n" // 8 int's

2746 "vmovl.u8 q3, d3 \n"	2746 "vmovl.u16 q3, d3 \n"

2747 "vcvt.f32.u32 q2, q2 \n" // 8 floats	2747 "vcvt.f32.u32 q2, q2 \n" // 8 floats

2748 "vcvt.f32.u32 q3, q3 \n"	2748 "vcvt.f32.u32 q3, q3 \n"

2749 "vmul.f32 q2, q2, q0 \n" // adjust exponent	2749 "vmul.f32 q2, q2, q0 \n" // adjust exponent

2750 "vmul.f32 q3, q3, q0 \n"	2750 "vmul.f32 q3, q3, q0 \n"

2751 "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat	2751 "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat

2752 "vqshrn.u32 d3, q3, #13 \n"	2752 "vqshrn.u32 d3, q3, #13 \n"

2753 MEMACCESS(1)	2753 MEMACCESS(1)

2754 "vst1.8 {q1}, [%0]! \n"	2754 "vst1.8 {q1}, [%1]! \n"

2755 "bgt 1b \n"	2755 "bgt 1b \n"

2756 : "+r"(src), // %0	2756 : "+r"(src), // %0

2757 "+r"(dst), // %1	2757 "+r"(dst), // %1

2758 "+r"(width) // %2	2758 "+r"(width) // %2

2759 : "r"(1.9259299444e-34f) // %3	2759 : "r"(1.9259299444e-34f) // %3

2760 : "cc", "memory", "q0", "q1", "q2", "q3"	2760 : "cc", "memory", "q0", "q1", "q2", "q3"

2761 );	2761 );

2762 }	2762 }

2763	2763

2764 // TODO(fbarchard): multiply by element.	2764 // TODO(fbarchard): multiply by element.

2765 void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {	2765 void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {

2766 asm volatile (	2766 asm volatile (

2767 "vdup.32 q0, %3 \n"	2767 "vdup.32 q0, %3 \n"

2768	2768

2769 "1: \n"	2769 "1: \n"

2770 MEMACCESS(0)	2770 MEMACCESS(0)

2771 "vld1.8 {q1}, [%0]! \n" // load 8 shorts	2771 "vld1.8 {q1}, [%0]! \n" // load 8 shorts

2772 "subs %2, %2, #8 \n" // 8 pixels per loop	2772 "subs %2, %2, #8 \n" // 8 pixels per loop

2773 "vmovl.u8 q2, d2 \n" // 8 int's	2773 "vmovl.u16 q2, d2 \n" // 8 int's

2774 "vmovl.u8 q3, d3 \n"	2774 "vmovl.u16 q3, d3 \n"

2775 "vcvt.f32.u32 q2, q2 \n" // 8 floats	2775 "vcvt.f32.u32 q2, q2 \n" // 8 floats

2776 "vcvt.f32.u32 q3, q3 \n"	2776 "vcvt.f32.u32 q3, q3 \n"

2777 "vmul.f32 q2, q2, q0 \n" // adjust exponent	2777 "vmul.f32 q2, q2, q0 \n" // adjust exponent

2778 "vmul.f32 q3, q3, q0 \n"	2778 "vmul.f32 q3, q3, q0 \n"

2779 "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat	2779 "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat

2780 "vqshrn.u32 d3, q3, #13 \n"	2780 "vqshrn.u32 d3, q3, #13 \n"

2781 MEMACCESS(1)	2781 MEMACCESS(1)

2782 "vst1.8 {q1}, [%0]! \n"	2782 "vst1.8 {q1}, [%1]! \n"

2783 "bgt 1b \n"	2783 "bgt 1b \n"

2784 : "+r"(src), // %0	2784 : "+r"(src), // %0

2785 "+r"(dst), // %1	2785 "+r"(dst), // %1

2786 "+r"(width) // %2	2786 "+r"(width) // %2

2787 : "r"(scale * 1.9259299444e-34f) // %3	2787 : "r"(scale * 1.9259299444e-34f) // %3

2788 : "cc", "memory", "q0", "q1", "q2", "q3"	2788 : "cc", "memory", "q0", "q1", "q2", "q3"

2789 );	2789 );

2790 }	2790 }

2791	2791

2792 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..	2792 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..

2793	2793

2794 #ifdef __cplusplus	2794 #ifdef __cplusplus

2795 } // extern "C"	2795 } // extern "C"

2796 } // namespace libyuv	2796 } // namespace libyuv

2797 #endif	2797 #endif

OLD	NEW

« no previous file with comments | « include/libyuv/version.h ('k') | source/row_neon64.cc » ('j') | no next file with comments »