Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(136)

Side by Side Diff: source/row_msa.cc

Issue 2397693002: Add MSA optimized YUY2ToI422, YUY2ToI420, UYVYToI422, UYVYToI420 functions (Closed)
Patch Set: Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« source/planar_functions.cc ('K') | « source/row_any.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 83 matching lines...) Expand 10 before | Expand all | Expand 10 after
94 ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1); 94 ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1);
95 ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3); 95 ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3);
96 ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16); 96 ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16);
97 src_u += 16; 97 src_u += 16;
98 src_v += 16; 98 src_v += 16;
99 src_y += 32; 99 src_y += 32;
100 dst_uyvy += 64; 100 dst_uyvy += 64;
101 } 101 }
102 } 102 }
103 103
104 void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) {
105 int x;
106 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
107 v16u8 dst0, dst1, dst2, dst3;
108
109 for (x = 0; x < width; x += 64) {
fbarchard1 2016/10/05 22:03:58 suggest 32 at a time... or 16. 4 of everything re
manojkumar.bhosale 2016/10/07 10:42:09 We have done loop unrolling considering the load l
110 LD_UB8(src_yuy2, 16, src0, src1, src2, src3, src4, src5, src6, src7);
111 PCKEV_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6,
112 dst0, dst1, dst2, dst3);
113 ST_UB4(dst0, dst1, dst2, dst3, dst_y, 16);
114 src_yuy2 += 128;
115 dst_y += 64;
fbarchard1 2016/10/05 22:03:58 this is unrolled quite a lot. other platforms are
manojkumar.bhosale 2016/10/07 10:42:09 Done.
116 }
117 }
118
119 void YUY2ToUVRow_MSA(const uint8* src_yuy2, int src_stride_yuy2,
120 uint8* dst_u, uint8* dst_v, int width) {
121 const uint8* nxt = src_yuy2 + src_stride_yuy2;
122 int x;
123 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
124 v16u8 vec0, vec1, dst0, dst1;
125
126 for (x = 0; x < width; x += 32) {
127 LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
128 LD_UB4(nxt, 16, src4, src5, src6, src7);
129 PCKOD_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6,
130 src0, src1, src2, src3);
131 AVER_UB2_UB(src0, src2, src1, src3, vec0, vec1);
132 dst0 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0);
fbarchard1 2016/10/05 22:03:58 style nit - i like seeing the intrinsics (or inli
manojkumar.bhosale 2016/10/07 10:42:09 Acknowledged.
133 dst1 = (v16u8) __msa_pckod_b((v16i8) vec1, (v16i8) vec0);
134 ST_UB(dst0, dst_u);
135 ST_UB(dst1, dst_v);
136 src_yuy2 += 64;
137 nxt += 64;
138 dst_u += 16;
139 dst_v += 16;
140 }
141 }
142
143 void YUY2ToUV422Row_MSA(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
fbarchard1 2016/10/05 22:03:58 prefer this be same as YUY2ToUV and do 32 at a ti
manojkumar.bhosale 2016/10/07 10:42:09 Done.
144 int width) {
145 int x;
146 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
147 v16u8 dst0, dst1, dst2, dst3;
148
149 for (x = 0; x < width; x += 64) {
150 LD_UB8(src_yuy2, 16, src0, src1, src2, src3, src4, src5, src6, src7);
151 PCKOD_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6,
152 src0, src1, src2, src3);
153 PCKEV_B2_UB(src1, src0, src3, src2, dst0, dst1);
154 PCKOD_B2_UB(src1, src0, src3, src2, dst2, dst3);
155 ST_UB2(dst0, dst1, dst_u, 16);
156 ST_UB2(dst2, dst3, dst_v, 16);
157 src_yuy2 += 128;
158 dst_u += 32;
159 dst_v += 32;
160 }
161 }
162
163 void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) {
164 int x;
165 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
166 v16u8 dst0, dst1, dst2, dst3;
167
168 for (x = 0; x < width; x += 64) {
169 LD_UB8(src_uyvy, 16, src0, src1, src2, src3, src4, src5, src6, src7);
170 PCKOD_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6,
171 dst0, dst1, dst2, dst3);
172 ST_UB4(dst0, dst1, dst2, dst3, dst_y, 16);
173 src_uyvy += 128;
174 dst_y += 64;
175 }
176 }
177
178 void UYVYToUVRow_MSA(const uint8* src_uyvy, int src_stride_uyvy,
179 uint8* dst_u, uint8* dst_v, int width) {
180 const uint8 *nxt = src_uyvy + src_stride_uyvy;
181 int x;
182 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
183 v16u8 vec0, vec1, dst0, dst1;
184
185 for (x = 0; x < width; x += 32) {
186 LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
187 LD_UB4(nxt, 16, src4, src5, src6, src7);
188 PCKEV_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6,
189 src0, src1, src2, src3);
190 AVER_UB2_UB(src0, src2, src1, src3, vec0, vec1);
191 dst0 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0);
192 dst1 = (v16u8) __msa_pckod_b((v16i8) vec1, (v16i8) vec0);
fbarchard1 2016/10/05 22:03:58 the i8 version of vectors seems to be preferred?
manojkumar.bhosale 2016/10/07 10:42:09 As we are operating on pixel data (u8), we have us
193 ST_UB(dst0, dst_u);
194 ST_UB(dst1, dst_v);
195 src_uyvy += 64;
196 nxt += 64;
197 dst_u += 16;
198 dst_v += 16;
199 }
200 }
201
202 void UYVYToUV422Row_MSA(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
203 int width) {
204 int x;
205 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
206 v16u8 dst0, dst1, dst2, dst3;
207
208 for (x = 0; x < width; x += 64) {
209 LD_UB8(src_uyvy, 16, src0, src1, src2, src3, src4, src5, src6, src7);
210 PCKEV_B4_UB(src1, src0, src3, src2, src5, src4, src7, src6,
211 src0, src1, src2, src3);
212 PCKEV_B2_UB(src1, src0, src3, src2, dst0, dst1);
213 PCKOD_B2_UB(src1, src0, src3, src2, dst2, dst3);
214 ST_UB2(dst0, dst1, dst_u, 16);
215 ST_UB2(dst2, dst3, dst_v, 16);
216 src_uyvy += 128;
217 dst_u += 32;
218 dst_v += 32;
219 }
220 }
221
104 #ifdef __cplusplus 222 #ifdef __cplusplus
105 } // extern "C" 223 } // extern "C"
106 } // namespace libyuv 224 } // namespace libyuv
107 #endif 225 #endif
108 226
109 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) 227 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
OLDNEW
« source/planar_functions.cc ('K') | « source/row_any.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698