OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 83 matching lines...) | |
94 ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1); | 94 ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1); |
95 ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3); | 95 ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3); |
96 ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16); | 96 ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16); |
97 src_u += 16; | 97 src_u += 16; |
98 src_v += 16; | 98 src_v += 16; |
99 src_y += 32; | 99 src_y += 32; |
100 dst_uyvy += 64; | 100 dst_uyvy += 64; |
101 } | 101 } |
102 } | 102 } |
103 | 103 |
104 void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) { | |
105 int x; | |
106 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; | |
107 v16u8 dst0, dst1, dst2, dst3; | |
108 | |
109 for (x = 0; x < width; x += 64) { | |
fbarchard1 (2016/10/05 22:03:58): suggest 32 at a time... or 16. 4 of everything re…
manojkumar.bhosale (2016/10/07 10:42:09): We have done loop unrolling considering the load l…
(A sketch of the suggested 32-pixels-per-iteration shape follows the function below.)
| |
110 LD_UB8(src_yuy2, 16, src0, src1, src2, src3, src4, src5, src6, src7); | |
111 PCKEV_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6, | |
112 dst0, dst1, dst2, dst3); | |
113 ST_UB4(dst0, dst1, dst2, dst3, dst_y, 16); | |
114 src_yuy2 += 128; | |
115 dst_y += 64; | |
fbarchard1 (2016/10/05 22:03:58): this is unrolled quite a lot. other platforms are…
manojkumar.bhosale (2016/10/07 10:42:09): Done.
| |
116 } | |
117 } | |
118 | |
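For reference, a minimal sketch of the reduced unrolling suggested above: 32 Y pixels per iteration instead of 64, reusing the LD/PCKEV/ST helpers from libyuv's macros_msa.h already used in this file. This illustrates the suggestion, not necessarily the exact code that landed.

  void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) {
    int x;
    v16u8 src0, src1, src2, src3, dst0, dst1;

    for (x = 0; x < width; x += 32) {
      LD_UB4(src_yuy2, 16, src0, src1, src2, src3);     // 64 bytes = 32 pixels
      PCKEV_B2_UB(src1, src0, src3, src2, dst0, dst1);  // even bytes are Y
      ST_UB2(dst0, dst1, dst_y, 16);
      src_yuy2 += 64;
      dst_y += 32;
    }
  }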
119 void YUY2ToUVRow_MSA(const uint8* src_yuy2, int src_stride_yuy2, | |
120 uint8* dst_u, uint8* dst_v, int width) { | |
121 const uint8* nxt = src_yuy2 + src_stride_yuy2; | |
122 int x; | |
123 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; | |
124 v16u8 vec0, vec1, dst0, dst1; | |
125 | |
126 for (x = 0; x < width; x += 32) { | |
127 LD_UB4(src_yuy2, 16, src0, src1, src2, src3); | |
128 LD_UB4(nxt, 16, src4, src5, src6, src7); | |
129 PCKOD_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6, | |
130 src0, src1, src2, src3); | |
131 AVER_UB2_UB(src0, src2, src1, src3, vec0, vec1); | |
132 dst0 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0); | |
fbarchard1 (2016/10/05 22:03:58): style nit - i like seeing the intrinsics (or inli…
manojkumar.bhosale (2016/10/07 10:42:09): Acknowledged.
(The sketch after this function spells out what the pack macros expand to.)
| |
133 dst1 = (v16u8) __msa_pckod_b((v16i8) vec1, (v16i8) vec0); | |
134 ST_UB(dst0, dst_u); | |
135 ST_UB(dst1, dst_v); | |
136 src_yuy2 += 64; | |
137 nxt += 64; | |
138 dst_u += 16; | |
139 dst_v += 16; | |
140 } | |
141 } | |
142 | |
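On the style note above: the pack macros are thin wrappers over the raw intrinsics. A minimal sketch of what PCKEV_B2_UB expands to, assuming the definitions in libyuv's macros_msa.h (the helper name here is hypothetical):

  #include <msa.h>

  // __msa_pckev_b keeps the even-indexed bytes of both operands, placing
  // those of its second operand in the low half of the result. The _UB
  // macro variant just adds the v16u8 casts around the v16i8 builtin.
  static void PckevB2UB(v16u8 in0, v16u8 in1, v16u8 in2, v16u8 in3,
                        v16u8* out0, v16u8* out1) {
    *out0 = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1);
    *out1 = (v16u8)__msa_pckev_b((v16i8)in2, (v16i8)in3);
  }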
143 void YUY2ToUV422Row_MSA(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, | |
fbarchard1 (2016/10/05 22:03:58): prefer this be same as YUY2ToUV and do 32 at a ti…
manojkumar.bhosale (2016/10/07 10:42:09): Done.
(A sketch of the requested 32-pixels-per-iteration shape follows the function below.)
| |
144 int width) { | |
145 int x; | |
146 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; | |
147 v16u8 dst0, dst1, dst2, dst3; | |
148 | |
149 for (x = 0; x < width; x += 64) { | |
150 LD_UB8(src_yuy2, 16, src0, src1, src2, src3, src4, src5, src6, src7); | |
151 PCKOD_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6, | |
152 src0, src1, src2, src3); | |
153 PCKEV_B2_UB(src1, src0, src3, src2, dst0, dst1); | |
154 PCKOD_B2_UB(src1, src0, src3, src2, dst2, dst3); | |
155 ST_UB2(dst0, dst1, dst_u, 16); | |
156 ST_UB2(dst2, dst3, dst_v, 16); | |
157 src_yuy2 += 128; | |
158 dst_u += 32; | |
159 dst_v += 32; | |
160 } | |
161 } | |
162 | |
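As with YUY2ToYRow above, a minimal sketch of the requested 32-pixels-per-iteration shape for this routine, reusing the same macros_msa.h helpers; an illustration of the request rather than the exact code that landed:

  void YUY2ToUV422Row_MSA(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
                          int width) {
    int x;
    v16u8 src0, src1, src2, src3, dst0, dst1;

    for (x = 0; x < width; x += 32) {
      LD_UB4(src_yuy2, 16, src0, src1, src2, src3);     // 32 YUY2 pixels
      PCKOD_B2_UB(src1, src0, src3, src2, src0, src1);  // odd bytes: UVUV...
      dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);  // U samples
      dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);  // V samples
      ST_UB(dst0, dst_u);
      ST_UB(dst1, dst_v);
      src_yuy2 += 64;
      dst_u += 16;
      dst_v += 16;
    }
  }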
163 void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) { | |
164 int x; | |
165 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; | |
166 v16u8 dst0, dst1, dst2, dst3; | |
167 | |
168 for (x = 0; x < width; x += 64) { | |
169 LD_UB8(src_uyvy, 16, src0, src1, src2, src3, src4, src5, src6, src7); | |
170 PCKOD_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6, | |
171 dst0, dst1, dst2, dst3); | |
172 ST_UB4(dst0, dst1, dst2, dst3, dst_y, 16); | |
173 src_uyvy += 128; | |
174 dst_y += 64; | |
175 } | |
176 } | |
177 | |
178 void UYVYToUVRow_MSA(const uint8* src_uyvy, int src_stride_uyvy, | |
179 uint8* dst_u, uint8* dst_v, int width) { | |
180 const uint8* nxt = src_uyvy + src_stride_uyvy; | |
181 int x; | |
182 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; | |
183 v16u8 vec0, vec1, dst0, dst1; | |
184 | |
185 for (x = 0; x < width; x += 32) { | |
186 LD_UB4(src_uyvy, 16, src0, src1, src2, src3); | |
187 LD_UB4(nxt, 16, src4, src5, src6, src7); | |
188 PCKEV_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6, | |
189 src0, src1, src2, src3); | |
190 AVER_UB2_UB(src0, src2, src1, src3, vec0, vec1); | |
191 dst0 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0); | |
192 dst1 = (v16u8) __msa_pckod_b((v16i8) vec1, (v16i8) vec0); | |
fbarchard1 (2016/10/05 22:03:58): the i8 version of vectors seems to be preferred?
manojkumar.bhosale (2016/10/07 10:42:09): As we are operating on pixel data (u8), we have us…
(The sketch after this function illustrates why the casts are needed.)
| |
193 ST_UB(dst0, dst_u); | |
194 ST_UB(dst1, dst_v); | |
195 src_uyvy += 64; | |
196 nxt += 64; | |
197 dst_u += 16; | |
198 dst_v += 16; | |
199 } | |
200 } | |
201 | |
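On the i8-vs-u8 question above: the GCC builtins for the pck* instructions are declared on signed byte vectors (v16i8), while pixel data is naturally unsigned (v16u8), hence the casts. They only reinterpret the vector type; no value conversion happens. A minimal sketch (hypothetical helper name):

  #include <msa.h>

  // Pack the even-indexed bytes of |lo| and |hi| into one vector: even
  // bytes of |lo| land in the low half, those of |hi| in the high half.
  // The casts move between unsigned and signed vector types without
  // touching any bytes.
  static inline v16u8 PackEvenBytes(v16u8 lo, v16u8 hi) {
    return (v16u8)__msa_pckev_b((v16i8)hi, (v16i8)lo);
  }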
202 void UYVYToUV422Row_MSA(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, | |
203 int width) { | |
204 int x; | |
205 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; | |
206 v16u8 dst0, dst1, dst2, dst3; | |
207 | |
208 for (x = 0; x < width; x += 64) { | |
209 LD_UB8(src_uyvy, 16, src0, src1, src2, src3, src4, src5, src6, src7); | |
210 PCKEV_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6, | |
211 src0, src1, src2, src3); | |
212 PCKEV_B2_UB(src1, src0, src3, src2, dst0, dst1); | |
213 PCKOD_B2_UB(src1, src0, src3, src2, dst2, dst3); | |
214 ST_UB2(dst0, dst1, dst_u, 16); | |
215 ST_UB2(dst2, dst3, dst_v, 16); | |
216 src_uyvy += 128; | |
217 dst_u += 32; | |
218 dst_v += 32; | |
219 } | |
220 } | |
221 | |
104 #ifdef __cplusplus | 222 #ifdef __cplusplus |
105 } // extern "C" | 223 } // extern "C" |
106 } // namespace libyuv | 224 } // namespace libyuv |
107 #endif | 225 #endif |
108 | 226 |
109 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) | 227 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) |