;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10 | |
11 | |
12 EXPORT |vp8_mse16x16_neon| | |
13 EXPORT |vp8_get4x4sse_cs_neon| | |
14 | |
15 ARM | |
16 REQUIRE8 | |
17 PRESERVE8 | |
18 | |
19 AREA ||.text||, CODE, READONLY, ALIGN=2 | |
;============================
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
;note: in this function, sum is never used. So, we can remove this part of calculation
;from vp8_variance().

;-----------------------------------------------------------------------
; unsigned int vp8_mse16x16_neon(unsigned char *src_ptr,  int source_stride,
;                                unsigned char *ref_ptr,  int recon_stride,
;                                unsigned int *sse)
;
; Sum of squared differences (SSE) over one 16x16 block.
; In:    r0 = src_ptr, r1 = source_stride, r2 = ref_ptr, r3 = recon_stride
;        [sp] = sse out-pointer (found at [sp, #16] once q7 is pushed)
; Out:   r0 = SSE; *sse is written with the same value.
; Note:  only the SSE is computed here; the sum term used elsewhere by
;        vp8_variance() is intentionally omitted.
;-----------------------------------------------------------------------
|vp8_mse16x16_neon| PROC
        vpush           {q7}                ; q7 = d14/d15 is callee-saved under AAPCS

        vmov.i8         q7, #0              ; q7,q8,q9,q10: four 32-bit SSE accumulators
        vmov.i8         q8, #0              ; (q8-q10 = d16-d21 are caller-saved, no save needed)
        vmov.i8         q9, #0
        vmov.i8         q10, #0

        mov             r12, #8             ; 8 iterations x 2 rows each = 16 rows

mse16x16_neon_loop
        vld1.8          {q0}, [r0], r1      ; load two 16-pixel source rows ...
        vld1.8          {q2}, [r2], r3      ; ... interleaved with two reference rows
        vld1.8          {q1}, [r0], r1
        vld1.8          {q3}, [r2], r3

        vsubl.u8        q11, d0, d4         ; widen src-ref differences to 16 bits
        vsubl.u8        q12, d1, d5
        vsubl.u8        q13, d2, d6
        vsubl.u8        q14, d3, d7

        vmlal.s16       q7, d22, d22        ; accumulate squared differences
        vmlal.s16       q8, d23, d23

        subs            r12, r12, #1        ; loop counter (scheduled between the MAC chains)

        vmlal.s16       q9, d24, d24
        vmlal.s16       q10, d25, d25
        vmlal.s16       q7, d26, d26
        vmlal.s16       q8, d27, d27
        vmlal.s16       q9, d28, d28
        vmlal.s16       q10, d29, d29

        bne             mse16x16_neon_loop

        vadd.u32        q7, q7, q8          ; fold the four accumulators together
        vadd.u32        q9, q9, q10

        ldr             r12, [sp, #16]      ; load *sse from the stack (+16 skips the saved q7)

        vadd.u32        q10, q7, q9
        vpaddl.u32      q1, q10             ; pairwise widen to two 64-bit partial sums
        vadd.u64        d0, d2, d3          ; final scalar sum in d0[0]

        vst1.32         {d0[0]}, [r12]      ; *sse = SSE
        vmov.32         r0, d0[0]           ; also return SSE in r0

        vpop            {q7}
        bx              lr

        ENDP
80 | |
81 | |
;=============================
; r0    unsigned char *src_ptr,
; r1    int source_stride,
; r2    unsigned char *ref_ptr,
; r3    int recon_stride
;-----------------------------------------------------------------------
; unsigned int vp8_get4x4sse_cs_neon(unsigned char *src_ptr, int source_stride,
;                                    unsigned char *ref_ptr, int recon_stride)
;
; Sum of squared differences (SSE) over one 4x4 block.
; In:    r0 = src_ptr, r1 = source_stride, r2 = ref_ptr, r3 = recon_stride
; Out:   r0 = SSE (no *sse out-pointer in this variant).
; Note:  each vld1.8 fetches 8 bytes per row, but only the low 4 widened
;        difference lanes (d22/d24/d26/d28) enter the multiplies, so just
;        the first 4 pixels of each row contribute.
;-----------------------------------------------------------------------
|vp8_get4x4sse_cs_neon| PROC
        vpush           {q7}                ; q7 = d14/d15 is callee-saved under AAPCS

        vld1.8          {d0}, [r0], r1      ; load four source rows ...
        vld1.8          {d4}, [r2], r3      ; ... interleaved with four reference rows
        vld1.8          {d1}, [r0], r1
        vld1.8          {d5}, [r2], r3
        vld1.8          {d2}, [r0], r1
        vld1.8          {d6}, [r2], r3
        vld1.8          {d3}, [r0], r1
        vld1.8          {d7}, [r2], r3

        vsubl.u8        q11, d0, d4         ; widen src-ref differences to 16 bits
        vsubl.u8        q12, d1, d5
        vsubl.u8        q13, d2, d6
        vsubl.u8        q14, d3, d7

        vmull.s16       q7, d22, d22        ; square the low 4 lanes of each row
        vmull.s16       q8, d24, d24
        vmull.s16       q9, d26, d26
        vmull.s16       q10, d28, d28

        vadd.u32        q7, q7, q8          ; fold the four partial-sum vectors
        vadd.u32        q9, q9, q10
        vadd.u32        q9, q7, q9

        vpaddl.u32      q1, q9              ; pairwise widen to two 64-bit partial sums
        vadd.u64        d0, d2, d3          ; final scalar sum in d0[0]

        vmov.32         r0, d0[0]           ; return SSE in r0

        vpop            {q7}
        bx              lr

        ENDP
122 | |
123 END | |
OLD | NEW |