OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #ifndef INCLUDE_LIBYUV_MACROS_MSA_H_ | 11 #ifndef INCLUDE_LIBYUV_MACROS_MSA_H_ |
12 #define INCLUDE_LIBYUV_MACROS_MSA_H_ | 12 #define INCLUDE_LIBYUV_MACROS_MSA_H_ |
13 | 13 |
14 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) | 14 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) |
15 #include <stdint.h> | 15 #include <stdint.h> |
16 #include <msa.h> | 16 #include <msa.h> |
17 | 17 |
18 #if (__mips_isa_rev >= 6) | 18 #if (__mips_isa_rev >= 6) |
19 #define LW(psrc) ({ \ | 19 #define LW(psrc) ({ \ |
20 uint8* psrc_lw_m = (uint8*) (psrc); \ /* NOLINT */ | 20 uint8* psrc_lw_m = (uint8*) (psrc); /* NOLINT */ \ |
21 uint32 val_m; \ | 21 uint32 val_m; \ |
22 asm volatile ( \ | 22 asm volatile ( \ |
23 "lw %[val_m], %[psrc_lw_m] \n\t" \ | 23 "lw %[val_m], %[psrc_lw_m] \n\t" \ |
24 : [val_m] "=r" (val_m) \ | 24 : [val_m] "=r" (val_m) \ |
25 : [psrc_lw_m] "m" (*psrc_lw_m) \ | 25 : [psrc_lw_m] "m" (*psrc_lw_m) \ |
26 ); \ | 26 ); \ |
27 \ | 27 val_m; \ |
28 val_m; \ | |
29 }) | 28 }) |
30 | 29 |
31 #if (__mips == 64) | 30 #if (__mips == 64) |
32 #define LD(psrc) ({ \ | 31 #define LD(psrc) ({ \ |
33 uint8* psrc_ld_m = (uint8*) (psrc); \ /* NOLINT */ | 32 uint8* psrc_ld_m = (uint8*) (psrc); /* NOLINT */ \ |
34 uint64 val_m = 0; \ | 33 uint64 val_m = 0; \ |
35 asm volatile ( \ | 34 asm volatile ( \ |
36 "ld %[val_m], %[psrc_ld_m] \n\t" \ | 35 "ld %[val_m], %[psrc_ld_m] \n\t" \ |
37 : [val_m] "=r" (val_m) \ | 36 : [val_m] "=r" (val_m) \ |
38 : [psrc_ld_m] "m" (*psrc_ld_m) \ | 37 : [psrc_ld_m] "m" (*psrc_ld_m) \ |
39 ); \ | 38 ); \ |
40 val_m; \ | 39 val_m; \ |
41 }) | 40 }) |
42 #else // !(__mips == 64) | 41 #else // !(__mips == 64) |
43 #define LD(psrc) ({ \ | 42 #define LD(psrc) ({ \ |
44 uint8* psrc_ld_m = (uint8*) (psrc); \ /* NOLINT */ | 43 uint8* psrc_ld_m = (uint8*) (psrc); /* NOLINT */ \ |
45 uint32 val0_m, val1_m; \ | 44 uint32 val0_m, val1_m; \ |
46 uint64 val_m = 0; \ | 45 uint64 val_m = 0; \ |
47 val0_m = LW(psrc_ld_m); \ | 46 val0_m = LW(psrc_ld_m); \ |
48 val1_m = LW(psrc_ld_m + 4); \ | 47 val1_m = LW(psrc_ld_m + 4); \ |
49 val_m = (uint64) (val1_m); \ /* NOLINT */ | 48 val_m = (uint64) (val1_m); /* NOLINT */ \ |
50 val_m = (uint64) ((val_m << 32) & 0xFFFFFFFF00000000); \ /* NOLINT */ | 49 val_m = (uint64) ((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \ |
51 val_m = (uint64) (val_m | (uint64) val0_m); \ /* NOLINT */ | 50 val_m = (uint64) (val_m | (uint64) val0_m); /* NOLINT */ \ |
52 val_m; \ | 51 val_m; \ |
53 }) | 52 }) |
54 #endif // (__mips == 64) | 53 #endif // (__mips == 64) |
55 #else // !(__mips_isa_rev >= 6) | 54 #else // !(__mips_isa_rev >= 6) |
56 #define LW(psrc) ({ \ | 55 #define LW(psrc) ({ \ |
57 uint8* psrc_lw_m = (uint8*) (psrc); \ /* NOLINT */ | 56 uint8* psrc_lw_m = (uint8*) (psrc); /* NOLINT */ \ |
58 uint32 val_m; \ | 57 uint32 val_m; \ |
59 asm volatile ( \ | 58 asm volatile ( \ |
60 "ulw %[val_m], %[psrc_lw_m] \n\t" \ | 59 "ulw %[val_m], %[psrc_lw_m] \n\t" \ |
61 : [val_m] "=r" (val_m) \ | 60 : [val_m] "=r" (val_m) \ |
62 : [psrc_lw_m] "m" (*psrc_lw_m) \ | 61 : [psrc_lw_m] "m" (*psrc_lw_m) \ |
63 ); \ | 62 ); \ |
64 val_m; \ | 63 val_m; \ |
65 }) | 64 }) |
66 | 65 |
67 #if (__mips == 64) | 66 #if (__mips == 64) |
68 #define LD(psrc) ({ \ | 67 #define LD(psrc) ({ \ |
69 uint8* psrc_ld_m = (uint8*) (psrc); \ /* NOLINT */ | 68 uint8* psrc_ld_m = (uint8*) (psrc); /* NOLINT */ \ |
70 uint64 val_m = 0; \ | 69 uint64 val_m = 0; \ |
71 asm volatile ( \ | 70 asm volatile ( \ |
72 "uld %[val_m], %[psrc_ld_m] \n\t" \ | 71 "uld %[val_m], %[psrc_ld_m] \n\t" \ |
73 : [val_m] "=r" (val_m) \ | 72 : [val_m] "=r" (val_m) \ |
74 : [psrc_ld_m] "m" (*psrc_ld_m) \ | 73 : [psrc_ld_m] "m" (*psrc_ld_m) \ |
75 ); \ | 74 ); \ |
76 val_m; \ | 75 val_m; \ |
77 }) | 76 }) |
78 #else // !(__mips == 64) | 77 #else // !(__mips == 64) |
79 #define LD(psrc) ({ \ | 78 #define LD(psrc) ({ \ |
80 uint8* psrc_ld_m = (uint8*) (psrc); \ /* NOLINT */ | 79 uint8* psrc_ld_m = (uint8*) (psrc); /* NOLINT */ \ |
81 uint32 val0_m, val1_m; \ | 80 uint32 val0_m, val1_m; \ |
82 uint64 val_m = 0; \ | 81 uint64 val_m = 0; \ |
83 val0_m = LW(psrc_ld_m); \ | 82 val0_m = LW(psrc_ld_m); \ |
84 val1_m = LW(psrc_ld_m + 4); \ | 83 val1_m = LW(psrc_ld_m + 4); \ |
85 val_m = (uint64) (val1_m); \ /* NOLINT */ | 84 val_m = (uint64) (val1_m); /* NOLINT */ \ |
86 val_m = (uint64) ((val_m << 32) & 0xFFFFFFFF00000000); \ /* NOLINT */ | 85 val_m = (uint64) ((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \ |
87 val_m = (uint64) (val_m | (uint64) val0_m); \ /* NOLINT */ | 86 val_m = (uint64) (val_m | (uint64) val0_m); /* NOLINT */ \ |
88 val_m; \ | 87 val_m; \ |
89 }) | 88 }) |
90 #endif // (__mips == 64) | 89 #endif // (__mips == 64) |
91 #endif // (__mips_isa_rev >= 6) | 90 #endif // (__mips_isa_rev >= 6) |
92 | 91 |
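For reference, the LW/LD statement expressions above read a 32- or 64-bit value from a possibly unaligned address: pre-R6 ISAs go through the ulw/uld assembler macros, MIPS r6 handles misalignment with plain lw/ld, and 32-bit builds assemble the 64-bit result from two word loads. A minimal portable C sketch of the same computation (illustrative helper names, memcpy standing in for the inline asm):

  #include <stdint.h>
  #include <string.h>

  /* Stand-in for LW: fetch 4 bytes from a possibly unaligned address.
     memcpy lets the compiler pick a load that is safe for the target. */
  static inline uint32_t lw_portable(const void* psrc) {
    uint32_t val_m;
    memcpy(&val_m, psrc, sizeof(val_m));
    return val_m;
  }

  /* Stand-in for the 32-bit LD path: two word loads combined into one
     64-bit value, the word at (psrc + 4) in the high half, matching
     the shift-and-or sequence in the macro. */
  static inline uint64_t ld_portable(const void* psrc) {
    const uint8_t* p = (const uint8_t*)psrc;
    uint64_t val0_m = lw_portable(p);
    uint64_t val1_m = lw_portable(p + 4);
    return (val1_m << 32) | val0_m;
  }
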
93 // TODO(fbarchard): Consider removing __VA_ARGS__ versions. | 92 // TODO(fbarchard): Consider removing __VA_ARGS__ versions. |
94 #define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */ | 93 #define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */ |
95 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__) | 94 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__) |
96 | 95 |
97 #define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ | 96 #define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ |
98 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__) | 97 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__) |
99 | 98 |
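LD_B/ST_B are direct vector-typed dereferences, so each compiles to a single MSA vector load or store; the _UB variants pin RTYPE to v16u8. A minimal usage sketch, assuming an MSA-enabled build (buffer name illustrative):

  uint8 row[16] = {0};
  v16u8 pixels = LD_UB(row);  /* read 16 bytes into one vector */
  ST_UB(pixels, row);         /* write the 16 bytes back out   */
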
100 /* Description : Load two vectors with 16 'byte' sized elements | 99 /* Description : Load two vectors with 16 'byte' sized elements |
101 Arguments : Inputs - psrc, stride | 100 Arguments : Inputs - psrc, stride |
102 Outputs - out0, out1 | 101 Outputs - out0, out1 |
103 Return Type - as per RTYPE | 102 Return Type - as per RTYPE |
104 Details : Load 16 byte elements in 'out0' from (psrc) | 103 Details : Load 16 byte elements in 'out0' from (psrc) |
105 Load 16 byte elements in 'out1' from (psrc + stride) | 104 Load 16 byte elements in 'out1' from (psrc + stride) |
106 */ | 105 */ |
107 #define LD_B2(RTYPE, psrc, stride, out0, out1) { \ | 106 #define LD_B2(RTYPE, psrc, stride, out0, out1) { \ |
108 out0 = LD_B(RTYPE, (psrc)); \ | 107 out0 = LD_B(RTYPE, (psrc)); \ |
109 out1 = LD_B(RTYPE, (psrc) + stride); \ | 108 out1 = LD_B(RTYPE, (psrc) + stride); \ |
110 } | 109 } |
111 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) | 110 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) |
112 | 111 |
113 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ | 112 #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ |
114 LD_B2(RTYPE, (psrc), stride, out0, out1); \ | 113 LD_B2(RTYPE, (psrc), stride, out0, out1); \ |
115 LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ | 114 LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ |
116 } | 115 } |
117 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) | 116 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) |
118 | 117 |
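A usage sketch for the strided loads, assuming src points at the first of four image rows spaced stride bytes apart (names illustrative):

  v16u8 r0, r1, r2, r3;
  /* Read the leading 16 bytes of rows 0 through 3. */
  LD_UB4(src, stride, r0, r1, r2, r3);
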
119 /* Description : Store two vectors with stride each having 16 'byte' sized | 118 /* Description : Store two vectors with stride each having 16 'byte' sized |
120 elements | 119 elements |
121 Arguments : Inputs - in0, in1, pdst, stride | 120 Arguments : Inputs - in0, in1, pdst, stride |
122 Details : Store 16 byte elements from 'in0' to (pdst) | 121 Details : Store 16 byte elements from 'in0' to (pdst) |
123 Store 16 byte elements from 'in1' to (pdst + stride) | 122 Store 16 byte elements from 'in1' to (pdst + stride) |
124 */ | 123 */ |
125 #define ST_B2(RTYPE, in0, in1, pdst, stride) { \ | 124 #define ST_B2(RTYPE, in0, in1, pdst, stride) { \ |
126 ST_B(RTYPE, in0, (pdst)); \ | 125 ST_B(RTYPE, in0, (pdst)); \ |
127 ST_B(RTYPE, in1, (pdst) + stride); \ | 126 ST_B(RTYPE, in1, (pdst) + stride); \ |
128 } | 127 } |
129 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) | 128 #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) |
130 | 129 |
131 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \ | 130 #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \ |
132 ST_B2(RTYPE, in0, in1, (pdst), stride); \ | 131 ST_B2(RTYPE, in0, in1, (pdst), stride); \ |
133 ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ | 132 ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ |
134 } | 133 } |
135 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) | 134 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) |
136 | 135 |
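Paired with the loads above, a sketch of a 16x4 block copy under the same assumptions (pointer and stride names illustrative):

  v16u8 r0, r1, r2, r3;
  LD_UB4(src, src_stride, r0, r1, r2, r3);  /* fetch four source rows */
  ST_UB4(r0, r1, r2, r3, dst, dst_stride);  /* store them to the dest */
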
137 // TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly. | 136 // TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly. |
138 /* Description : Shuffle byte vector elements as per mask vector | 137 /* Description : Shuffle byte vector elements as per mask vector |
139 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 | 138 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 |
140 Outputs - out0, out1 | 139 Outputs - out0, out1 |
141 Return Type - as per RTYPE | 140 Return Type - as per RTYPE |
142 Details : Byte elements from 'in0' & 'in1' are copied selectively to | 141 Details : Byte elements from 'in0' & 'in1' are copied selectively to |
143 'out0' as per control vector 'mask0' | 142 'out0' as per control vector 'mask0' |
144 */ | 143 */ |
145 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \ | 144 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \ |
146 out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0); \ | 145 out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0); \ |
147 out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2); \ | 146 out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2); \ |
148 } | 147 } |
149 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) | 148 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) |
150 | 149 |
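Each mask lane selects one byte from the concatenation of the two data operands, so with identical operands a descending mask reverses byte order. A sketch, assuming an MSA-enabled build (pointer names illustrative):

  v16u8 in0 = LD_UB(src);       /* first 16 source bytes */
  v16u8 in1 = LD_UB(src + 16);  /* next 16 source bytes  */
  v16i8 mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
  v16u8 out0, out1;
  /* Shuffle each vector against itself to reverse its bytes. */
  VSHF_B2_UB(in0, in0, in1, in1, mask, mask, out0, out1);
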
151 /* Description : Interleave both left and right halves of input vectors | 150 /* Description : Interleave both left and right halves of input vectors |
152 Arguments : Inputs - in0, in1 | 151 Arguments : Inputs - in0, in1 |
153 Outputs - out0, out1 | 152 Outputs - out0, out1 |
154 Return Type - as per RTYPE | 153 Return Type - as per RTYPE |
155 Details : Right half of byte elements from 'in0' and 'in1' are | 154 Details : Right half of byte elements from 'in0' and 'in1' are |
156 interleaved and written to 'out0' | 155 interleaved and written to 'out0' |
157 */ | 156 */ |
158 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) { \ | 157 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) { \ |
159 out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \ | 158 out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \ |
160 out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \ | 159 out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \ |
161 } | 160 } |
162 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) | 161 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) |
163 | 162 |
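A typical use of the interleaves is merging separate U and V rows into an alternating UV row, as in NV12-style layouts. The sketch below assumes __msa_ilvr_b places its second operand's bytes in the even lanes, an operand-order detail worth verifying against the MSA manual (pointer names illustrative):

  v16u8 u_row = LD_UB(src_u);  /* 16 U samples */
  v16u8 v_row = LD_UB(src_v);  /* 16 V samples */
  v16u8 uv_lo, uv_hi;
  /* Assumed order: u_row bytes in even lanes, v_row in odd lanes. */
  ILVRL_B2_UB(v_row, u_row, uv_lo, uv_hi);
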
164 #endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */ | 163 #endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */ |
165 | 164 |
166 #endif // INCLUDE_LIBYUV_MACROS_MSA_H_ | 165 #endif // INCLUDE_LIBYUV_MACROS_MSA_H_ |