Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(249)

Side by Side Diff: source/row_msa.cc

Issue 2641153003: Add MSA optimized ARGB/ABGR/BGRA/RGBA To Y/UV row functions (Closed)
Patch Set: Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/row_any.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
45 out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m); \ 45 out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m); \
46 out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \ 46 out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \
47 } 47 }
48 48
49 // Convert 8 pixels of YUV 420 to RGB. 49 // Convert 8 pixels of YUV 420 to RGB.
50 #define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \ 50 #define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
51 { \ 51 { \
52 v8i16 vec0_m, vec1_m; \ 52 v8i16 vec0_m, vec1_m; \
53 v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ 53 v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
54 v4i32 reg5_m, reg6_m, reg7_m; \ 54 v4i32 reg5_m, reg6_m, reg7_m; \
55 v4i32 max = __msa_ldi_w(255); \ 55 v4i32 max_m = __msa_ldi_w(255); \
56 v16i8 zero = {0}; \ 56 v16i8 zero_m = {0}; \
57 \ 57 \
58 vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ 58 vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
59 vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)in_uv); \ 59 vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \
60 reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0_m); \ 60 reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \
61 reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0_m); \ 61 reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \
62 reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1_m); \ 62 reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \
63 reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1_m); \ 63 reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \
64 reg0_m *= yg; \ 64 reg0_m *= yg; \
65 reg1_m *= yg; \ 65 reg1_m *= yg; \
66 reg2_m *= ubvr; \ 66 reg2_m *= ubvr; \
67 reg3_m *= ubvr; \ 67 reg3_m *= ubvr; \
68 reg0_m = __msa_srai_w(reg0_m, 16); \ 68 reg0_m = __msa_srai_w(reg0_m, 16); \
69 reg1_m = __msa_srai_w(reg1_m, 16); \ 69 reg1_m = __msa_srai_w(reg1_m, 16); \
70 reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \ 70 reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
71 reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \ 71 reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
72 reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \ 72 reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
73 reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \ 73 reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \
(...skipping 17 matching lines...) Expand all
91 reg7_m = __msa_srai_w(reg7_m, 6); \ 91 reg7_m = __msa_srai_w(reg7_m, 6); \
92 reg4_m = __msa_srai_w(reg4_m, 6); \ 92 reg4_m = __msa_srai_w(reg4_m, 6); \
93 reg2_m = __msa_srai_w(reg2_m, 6); \ 93 reg2_m = __msa_srai_w(reg2_m, 6); \
94 reg3_m = __msa_srai_w(reg3_m, 6); \ 94 reg3_m = __msa_srai_w(reg3_m, 6); \
95 reg5_m = __msa_maxi_s_w(reg5_m, 0); \ 95 reg5_m = __msa_maxi_s_w(reg5_m, 0); \
96 reg6_m = __msa_maxi_s_w(reg6_m, 0); \ 96 reg6_m = __msa_maxi_s_w(reg6_m, 0); \
97 reg7_m = __msa_maxi_s_w(reg7_m, 0); \ 97 reg7_m = __msa_maxi_s_w(reg7_m, 0); \
98 reg4_m = __msa_maxi_s_w(reg4_m, 0); \ 98 reg4_m = __msa_maxi_s_w(reg4_m, 0); \
99 reg2_m = __msa_maxi_s_w(reg2_m, 0); \ 99 reg2_m = __msa_maxi_s_w(reg2_m, 0); \
100 reg3_m = __msa_maxi_s_w(reg3_m, 0); \ 100 reg3_m = __msa_maxi_s_w(reg3_m, 0); \
101 reg5_m = __msa_min_s_w(max, reg5_m); \ 101 reg5_m = __msa_min_s_w(max_m, reg5_m); \
102 reg6_m = __msa_min_s_w(max, reg6_m); \ 102 reg6_m = __msa_min_s_w(max_m, reg6_m); \
103 reg7_m = __msa_min_s_w(max, reg7_m); \ 103 reg7_m = __msa_min_s_w(max_m, reg7_m); \
104 reg4_m = __msa_min_s_w(max, reg4_m); \ 104 reg4_m = __msa_min_s_w(max_m, reg4_m); \
105 reg2_m = __msa_min_s_w(max, reg2_m); \ 105 reg2_m = __msa_min_s_w(max_m, reg2_m); \
106 reg3_m = __msa_min_s_w(max, reg3_m); \ 106 reg3_m = __msa_min_s_w(max_m, reg3_m); \
107 out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \ 107 out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
108 out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \ 108 out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
109 out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ 109 out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
110 } 110 }
111 111
112 // Pack and Store 8 ARGB values. 112 // Pack and Store 8 ARGB values.
113 #define STOREARGB(in0, in1, in2, in3, pdst_argb) \ 113 #define STOREARGB(in0, in1, in2, in3, pdst_argb) \
114 { \ 114 { \
115 v8i16 vec0_m, vec1_m; \ 115 v8i16 vec0_m, vec1_m; \
116 v16u8 dst0_m, dst1_m; \ 116 v16u8 dst0_m, dst1_m; \
117 vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ 117 vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
118 vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ 118 vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
119 dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \ 119 dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \
120 dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \ 120 dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \
121 ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \ 121 ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \
122 } 122 }
123 123
// Takes ARGB input and calculates Y.
// argb0..argb3: four 16-byte vectors, i.e. 16 four-byte pixels in total.
// const0: per-halfword packed byte weights applied to the first two bytes of
//   each pixel (pckev_h below gathers channel pair 0/1).
// const1: per-halfword packed byte weights applied to the last two bytes of
//   each pixel (pckod_h gathers channel pair 2/3).
// const2: rounding/offset bias added to each 16-bit accumulator.
// shift: final arithmetic right shift (7 or 8 depending on weight scale).
// y_out: receives the 16 resulting luma bytes.
// NOTE: comments are kept outside the macro body because '//' would swallow
// the line-continuation backslashes.
#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \
                y_out)                                                     \
  {                                                                        \
    v16u8 vec0_m, vec1_m, vec2_m, vec3_m;                                  \
    v8u16 reg0_m, reg1_m;                                                  \
                                                                           \
    vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0);             \
    vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2);             \
    vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0);             \
    vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2);             \
    reg0_m = __msa_dotp_u_h(vec0_m, const0);                               \
    reg1_m = __msa_dotp_u_h(vec1_m, const0);                               \
    reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1);                      \
    reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1);                      \
    reg0_m += const2;                                                      \
    reg1_m += const2;                                                      \
    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift);                    \
    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift);                    \
    y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);            \
  }
145
// Loads the current row (s_ptr) and next row (t_ptr) of ARGB input and
// box-averages each 2x2 pixel block to produce 32 averaged ARGB pixels in
// argb0..argb3.  Reads 128 bytes (32 pixels) from each row.
// Method: bytes of the two rows are interleaved (ilvr/ilvl) and summed
// vertically (hadd_u_h), horizontally adjacent pixels are summed by pairing
// even/odd doublewords (pckev_d/pckod_d + add), and the four-sample sums are
// divided by 4 (arithmetic shift right by 2) before being re-packed to bytes.
// Fix: the body now uses the macro parameters s_ptr/t_ptr; it previously
// hard-coded the caller's variable names 's' and 't'.
#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3)               \
  {                                                                       \
    v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
    v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
    v16u8 vec8_m, vec9_m;                                                 \
    v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
    v8u16 reg8_m, reg9_m;                                                 \
                                                                          \
    src0_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 0);                         \
    src1_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 16);                        \
    src2_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 32);                        \
    src3_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 48);                        \
    src4_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 0);                         \
    src5_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 16);                        \
    src6_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 32);                        \
    src7_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 48);                        \
    vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m);           \
    vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m);           \
    vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m);           \
    vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m);           \
    vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m);           \
    vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m);           \
    vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m);           \
    vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m);           \
    reg0_m = __msa_hadd_u_h(vec0_m, vec0_m);                              \
    reg1_m = __msa_hadd_u_h(vec1_m, vec1_m);                              \
    reg2_m = __msa_hadd_u_h(vec2_m, vec2_m);                              \
    reg3_m = __msa_hadd_u_h(vec3_m, vec3_m);                              \
    reg4_m = __msa_hadd_u_h(vec4_m, vec4_m);                              \
    reg5_m = __msa_hadd_u_h(vec5_m, vec5_m);                              \
    reg6_m = __msa_hadd_u_h(vec6_m, vec6_m);                              \
    reg7_m = __msa_hadd_u_h(vec7_m, vec7_m);                              \
    reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m);          \
    reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m);          \
    reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m);         \
    reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m);         \
    reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m);          \
    reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m);          \
    reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m);         \
    reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m);         \
    reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2);                       \
    reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2);                       \
    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2);                       \
    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2);                       \
    argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m);           \
    argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);           \
    src0_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 64);                        \
    src1_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 80);                        \
    src2_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 96);                        \
    src3_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 112);                       \
    src4_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 64);                        \
    src5_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 80);                        \
    src6_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 96);                        \
    src7_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 112);                       \
    vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m);           \
    vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m);           \
    vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m);           \
    vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m);           \
    vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m);           \
    vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m);           \
    vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m);           \
    vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m);           \
    reg0_m = __msa_hadd_u_h(vec2_m, vec2_m);                              \
    reg1_m = __msa_hadd_u_h(vec3_m, vec3_m);                              \
    reg2_m = __msa_hadd_u_h(vec4_m, vec4_m);                              \
    reg3_m = __msa_hadd_u_h(vec5_m, vec5_m);                              \
    reg4_m = __msa_hadd_u_h(vec6_m, vec6_m);                              \
    reg5_m = __msa_hadd_u_h(vec7_m, vec7_m);                              \
    reg6_m = __msa_hadd_u_h(vec8_m, vec8_m);                              \
    reg7_m = __msa_hadd_u_h(vec9_m, vec9_m);                              \
    reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m);          \
    reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m);          \
    reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m);         \
    reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m);         \
    reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m);          \
    reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m);          \
    reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m);         \
    reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m);         \
    reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2);                       \
    reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2);                       \
    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2);                       \
    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2);                       \
    argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m);           \
    argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);           \
  }
232
// Takes ARGB input and calculates U and V.
// argb0..argb3: four 16-byte pixel vectors (typically the 2x2-averaged
//   output of READ_ARGB).
// shf0..shf3: byte shuffle masks selecting channel pairs from pairs of
//   input vectors; the caller's shuffle/constant ordering determines which
//   channels feed U vs. V.
// const0..const2: packed per-halfword channel weights (const1 is added via
//   dot product, const0/const2 are subtracted).
// const3: bias (callers pass 0x8080, i.e. 128 offset plus rounding in the
//   high byte of each halfword).
// v_out/u_out: 16 V bytes and 16 U bytes; pckod_b keeps the odd (high)
//   byte of each 16-bit result, which is an implicit >> 8.
// NOTE: comments are kept outside the macro body because '//' would swallow
// the line-continuation backslashes.
#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
                 shf0, shf1, shf2, shf3, v_out, u_out)                       \
  {                                                                          \
    v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
    v8u16 reg0_m, reg1_m, reg2_m, reg3_m;                                    \
                                                                             \
    vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0);          \
    vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2);          \
    vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0);          \
    vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2);          \
    vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0);          \
    vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2);          \
    vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0);          \
    vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2);          \
    reg0_m = __msa_dotp_u_h(vec0_m, const1);                                 \
    reg1_m = __msa_dotp_u_h(vec1_m, const1);                                 \
    reg2_m = __msa_dotp_u_h(vec4_m, const1);                                 \
    reg3_m = __msa_dotp_u_h(vec5_m, const1);                                 \
    reg0_m += const3;                                                        \
    reg1_m += const3;                                                        \
    reg2_m += const3;                                                        \
    reg3_m += const3;                                                        \
    reg0_m -= __msa_dotp_u_h(vec2_m, const0);                                \
    reg1_m -= __msa_dotp_u_h(vec3_m, const0);                                \
    reg2_m -= __msa_dotp_u_h(vec6_m, const2);                                \
    reg3_m -= __msa_dotp_u_h(vec7_m, const2);                                \
    v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m);              \
    u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m);              \
  }
263
124 void MirrorRow_MSA(const uint8* src, uint8* dst, int width) { 264 void MirrorRow_MSA(const uint8* src, uint8* dst, int width) {
125 int x; 265 int x;
126 v16u8 src0, src1, src2, src3; 266 v16u8 src0, src1, src2, src3;
127 v16u8 dst0, dst1, dst2, dst3; 267 v16u8 dst0, dst1, dst2, dst3;
128 v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; 268 v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
129 src += width - 64; 269 src += width - 64;
130 270
131 for (x = 0; x < width; x += 64) { 271 for (x = 0; x < width; x += 64) {
132 LD_UB4(src, 16, src3, src2, src1, src0); 272 LD_UB4(src, 16, src3, src2, src1, src0);
133 VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); 273 VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
(...skipping 2099 matching lines...) Expand 10 before | Expand all | Expand 10 after
2233 dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1); 2373 dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1);
2234 dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2); 2374 dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2);
2235 dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2); 2375 dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2);
2236 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); 2376 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
2237 src_sobelx += 16; 2377 src_sobelx += 16;
2238 src_sobely += 16; 2378 src_sobely += 16;
2239 dst_argb += 64; 2379 dst_argb += 64;
2240 } 2380 }
2241 } 2381 }
2242 2382
2383 void ARGBToYJRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
2384 int x;
2385 v16u8 src0, src1, src2, src3, dst0;
2386 v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
2387 v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26);
2388 v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40);
2389
2390 for (x = 0; x < width; x += 16) {
2391 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
2392 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
2393 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
2394 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
2395 ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7,
2396 dst0);
2397 ST_UB(dst0, dst_y);
2398 src_argb0 += 64;
2399 dst_y += 16;
2400 }
2401 }
2402
2403 void BGRAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
2404 int x;
2405 v16u8 src0, src1, src2, src3, dst0;
2406 v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200);
2407 v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981);
2408 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
2409
2410 for (x = 0; x < width; x += 16) {
2411 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
2412 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
2413 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
2414 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
2415 ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8,
2416 dst0);
2417 ST_UB(dst0, dst_y);
2418 src_argb0 += 64;
2419 dst_y += 16;
2420 }
2421 }
2422
2423 void ABGRToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
2424 int x;
2425 v16u8 src0, src1, src2, src3, dst0;
2426 v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142);
2427 v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19);
2428 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
2429
2430 for (x = 0; x < width; x += 16) {
2431 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
2432 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
2433 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
2434 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
2435 ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8,
2436 dst0);
2437 ST_UB(dst0, dst_y);
2438 src_argb0 += 64;
2439 dst_y += 16;
2440 }
2441 }
2442
2443 void RGBAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
2444 int x;
2445 v16u8 src0, src1, src2, src3, dst0;
2446 v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900);
2447 v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281);
2448 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
2449
2450 for (x = 0; x < width; x += 16) {
2451 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
2452 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
2453 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
2454 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
2455 ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8,
2456 dst0);
2457 ST_UB(dst0, dst_y);
2458 src_argb0 += 64;
2459 dst_y += 16;
2460 }
2461 }
2462
// Takes two rows of ARGB input, averages each 2x2 pixel block inline and
// computes full-range (JPEG) U and V.  Each iteration consumes 32 columns
// (128 bytes per row) and writes 16 U and 16 V bytes.
void ARGBToUVJRow_MSA(const uint8* src_rgb0,
                      int src_stride_rgb,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  int x;
  const uint8* s = src_rgb0;                   // current row
  const uint8* t = src_rgb0 + src_stride_rgb;  // next row
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 vec0, vec1, vec2, vec3;                // 2x2-averaged pixels
  v16u8 dst0, dst1;
  // Byte-shuffle masks handed to ARGBTOUV to gather channel pairs from two
  // vectors at a time (indices 16..31 address the second operand).
  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
  v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
                     18, 19, 22, 23, 26, 27, 30, 31};
  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
  v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
  // Packed JPEG-range chroma weights; which channels they multiply is fixed
  // by the shuffler ordering in the ARGBTOUV call below.
  // NOTE(review): coefficient/channel mapping assumed correct -- verify
  // against the scalar ARGBToUVJRow reference.
  v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F);
  v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14);
  v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54);
  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);  // 128 offset + rounding

  for (x = 0; x < width; x += 32) {
    // First 16 pixels of each row.
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
    src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
    src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
    src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
    // Vertical byte-wise average of the two rows.
    src0 = __msa_aver_u_b(src0, src4);
    src1 = __msa_aver_u_b(src1, src5);
    src2 = __msa_aver_u_b(src2, src6);
    src3 = __msa_aver_u_b(src3, src7);
    // Pair up horizontally adjacent pixels (each pixel is one 32-bit lane)...
    src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
    src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
    src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
    src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
    // ...and average them, yielding 2x2-averaged pixels.
    vec0 = __msa_aver_u_b(src4, src6);
    vec1 = __msa_aver_u_b(src5, src7);
    // Second 16 pixels of each row, same scheme.
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 64);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 80);
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 96);
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 112);
    src4 = (v16u8)__msa_ld_b((v16i8*)t, 64);
    src5 = (v16u8)__msa_ld_b((v16i8*)t, 80);
    src6 = (v16u8)__msa_ld_b((v16i8*)t, 96);
    src7 = (v16u8)__msa_ld_b((v16i8*)t, 112);
    src0 = __msa_aver_u_b(src0, src4);
    src1 = __msa_aver_u_b(src1, src5);
    src2 = __msa_aver_u_b(src2, src6);
    src3 = __msa_aver_u_b(src3, src7);
    src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
    src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
    src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
    src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
    vec2 = __msa_aver_u_b(src4, src6);
    vec3 = __msa_aver_u_b(src5, src7);
    ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54,
             const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
             dst1);
    ST_UB(dst0, dst_v);
    ST_UB(dst1, dst_u);
    s += 128;
    t += 128;
    dst_v += 16;
    dst_u += 16;
  }
}
2532
2533 void BGRAToUVRow_MSA(const uint8* src_rgb0,
2534 int src_stride_rgb,
2535 uint8* dst_u,
2536 uint8* dst_v,
2537 int width) {
2538 int x;
2539 const uint8* s = src_rgb0;
2540 const uint8* t = src_rgb0 + src_stride_rgb;
2541 v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
2542 v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
2543 v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
2544 18, 19, 22, 23, 26, 27, 30, 31};
2545 v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
2546 v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
2547 v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
2548 v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
2549 v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
2550 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
2551
2552 for (x = 0; x < width; x += 16) {
2553 READ_ARGB(s, t, vec0, vec1, vec2, vec3);
2554 ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
2555 const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
2556 dst1);
2557 ST_UB(dst0, dst_v);
2558 ST_UB(dst1, dst_u);
2559 s += 128;
2560 t += 128;
2561 dst_v += 16;
2562 dst_u += 16;
2563 }
2564 }
2565
2566 void ABGRToUVRow_MSA(const uint8* src_rgb0,
2567 int src_stride_rgb,
2568 uint8* dst_u,
2569 uint8* dst_v,
2570 int width) {
2571 int x;
2572 const uint8* s = src_rgb0;
2573 const uint8* t = src_rgb0 + src_stride_rgb;
2574 v16u8 src0, src1, src2, src3;
2575 v16u8 dst0, dst1;
2576 v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
2577 v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
2578 18, 19, 22, 23, 26, 27, 30, 31};
2579 v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
2580 v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
2581 v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26);
2582 v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070);
2583 v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
2584 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
2585
2586 for (x = 0; x < width; x += 16) {
2587 READ_ARGB(s, t, src0, src1, src2, src3);
2588 ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E,
2589 const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
2590 dst1);
2591 ST_UB(dst0, dst_u);
2592 ST_UB(dst1, dst_v);
2593 s += 128;
2594 t += 128;
2595 dst_u += 16;
2596 dst_v += 16;
2597 }
2598 }
2599
2600 void RGBAToUVRow_MSA(const uint8* src_rgb0,
2601 int src_stride_rgb,
2602 uint8* dst_u,
2603 uint8* dst_v,
2604 int width) {
2605 int x;
2606 const uint8* s = src_rgb0;
2607 const uint8* t = src_rgb0 + src_stride_rgb;
2608 v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
2609 v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
2610 v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
2611 18, 19, 22, 23, 26, 27, 30, 31};
2612 v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
2613 v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
2614 v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A);
2615 v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
2616 v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E);
2617 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
2618
2619 for (x = 0; x < width; x += 16) {
2620 READ_ARGB(s, t, vec0, vec1, vec2, vec3);
2621 ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
2622 const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
2623 dst1);
2624 ST_UB(dst0, dst_u);
2625 ST_UB(dst1, dst_v);
2626 s += 128;
2627 t += 128;
2628 dst_u += 16;
2629 dst_v += 16;
2630 }
2631 }
2632
2243 #ifdef __cplusplus 2633 #ifdef __cplusplus
2244 } // extern "C" 2634 } // extern "C"
2245 } // namespace libyuv 2635 } // namespace libyuv
2246 #endif 2636 #endif
2247 2637
2248 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) 2638 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
OLDNEW
« no previous file with comments | « source/row_any.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698