Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 45 out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m); \ | 45 out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m); \ |
| 46 out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \ | 46 out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \ |
| 47 } | 47 } |
| 48 | 48 |
| 49 // Convert 8 pixels of YUV 420 to RGB. | 49 // Convert 8 pixels of YUV 420 to RGB. |
| 50 #define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \ | 50 #define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \ |
| 51 { \ | 51 { \ |
| 52 v8i16 vec0_m, vec1_m; \ | 52 v8i16 vec0_m, vec1_m; \ |
| 53 v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ | 53 v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ |
| 54 v4i32 reg5_m, reg6_m, reg7_m; \ | 54 v4i32 reg5_m, reg6_m, reg7_m; \ |
| 55 v4i32 max = __msa_ldi_w(255); \ | 55 v4i32 max_m = __msa_ldi_w(255); \ |
| 56 v16i8 zero = {0}; \ | 56 v16i8 zero_m = {0}; \ |
| 57 \ | 57 \ |
| 58 vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ | 58 vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ |
| 59 vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)in_uv); \ | 59 vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \ |
| 60 reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0_m); \ | 60 reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \ |
| 61 reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0_m); \ | 61 reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \ |
| 62 reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1_m); \ | 62 reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \ |
| 63 reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1_m); \ | 63 reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \ |
| 64 reg0_m *= yg; \ | 64 reg0_m *= yg; \ |
| 65 reg1_m *= yg; \ | 65 reg1_m *= yg; \ |
| 66 reg2_m *= ubvr; \ | 66 reg2_m *= ubvr; \ |
| 67 reg3_m *= ubvr; \ | 67 reg3_m *= ubvr; \ |
| 68 reg0_m = __msa_srai_w(reg0_m, 16); \ | 68 reg0_m = __msa_srai_w(reg0_m, 16); \ |
| 69 reg1_m = __msa_srai_w(reg1_m, 16); \ | 69 reg1_m = __msa_srai_w(reg1_m, 16); \ |
| 70 reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \ | 70 reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \ |
| 71 reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \ | 71 reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \ |
| 72 reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \ | 72 reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \ |
| 73 reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \ | 73 reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \ |
| (...skipping 17 matching lines...) Expand all Loading... | |
| 91 reg7_m = __msa_srai_w(reg7_m, 6); \ | 91 reg7_m = __msa_srai_w(reg7_m, 6); \ |
| 92 reg4_m = __msa_srai_w(reg4_m, 6); \ | 92 reg4_m = __msa_srai_w(reg4_m, 6); \ |
| 93 reg2_m = __msa_srai_w(reg2_m, 6); \ | 93 reg2_m = __msa_srai_w(reg2_m, 6); \ |
| 94 reg3_m = __msa_srai_w(reg3_m, 6); \ | 94 reg3_m = __msa_srai_w(reg3_m, 6); \ |
| 95 reg5_m = __msa_maxi_s_w(reg5_m, 0); \ | 95 reg5_m = __msa_maxi_s_w(reg5_m, 0); \ |
| 96 reg6_m = __msa_maxi_s_w(reg6_m, 0); \ | 96 reg6_m = __msa_maxi_s_w(reg6_m, 0); \ |
| 97 reg7_m = __msa_maxi_s_w(reg7_m, 0); \ | 97 reg7_m = __msa_maxi_s_w(reg7_m, 0); \ |
| 98 reg4_m = __msa_maxi_s_w(reg4_m, 0); \ | 98 reg4_m = __msa_maxi_s_w(reg4_m, 0); \ |
| 99 reg2_m = __msa_maxi_s_w(reg2_m, 0); \ | 99 reg2_m = __msa_maxi_s_w(reg2_m, 0); \ |
| 100 reg3_m = __msa_maxi_s_w(reg3_m, 0); \ | 100 reg3_m = __msa_maxi_s_w(reg3_m, 0); \ |
| 101 reg5_m = __msa_min_s_w(max, reg5_m); \ | 101 reg5_m = __msa_min_s_w(max_m, reg5_m); \ |
| 102 reg6_m = __msa_min_s_w(max, reg6_m); \ | 102 reg6_m = __msa_min_s_w(max_m, reg6_m); \ |
| 103 reg7_m = __msa_min_s_w(max, reg7_m); \ | 103 reg7_m = __msa_min_s_w(max_m, reg7_m); \ |
| 104 reg4_m = __msa_min_s_w(max, reg4_m); \ | 104 reg4_m = __msa_min_s_w(max_m, reg4_m); \ |
| 105 reg2_m = __msa_min_s_w(max, reg2_m); \ | 105 reg2_m = __msa_min_s_w(max_m, reg2_m); \ |
| 106 reg3_m = __msa_min_s_w(max, reg3_m); \ | 106 reg3_m = __msa_min_s_w(max_m, reg3_m); \ |
| 107 out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \ | 107 out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \ |
| 108 out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \ | 108 out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \ |
| 109 out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ | 109 out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ |
| 110 } | 110 } |
| 111 | 111 |
| 112 // Pack and Store 8 ARGB values. | 112 // Pack and Store 8 ARGB values. |
| 113 #define STOREARGB(in0, in1, in2, in3, pdst_argb) \ | 113 #define STOREARGB(in0, in1, in2, in3, pdst_argb) \ |
| 114 { \ | 114 { \ |
| 115 v8i16 vec0_m, vec1_m; \ | 115 v8i16 vec0_m, vec1_m; \ |
| 116 v16u8 dst0_m, dst1_m; \ | 116 v16u8 dst0_m, dst1_m; \ |
| 117 vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ | 117 vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ |
| 118 vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ | 118 vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ |
| 119 dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \ | 119 dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \ |
| 120 dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \ | 120 dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \ |
| 121 ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \ | 121 ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \ |
| 122 } | 122 } |
| 123 | 123 |
| 124 // Takes ARGB input and calculates Y. | |
| 125 #define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \ | |
| 126 y_out) \ | |
| 127 { \ | |
| 128 v16u8 vec0_m, vec1_m, vec2_m, vec3_m; \ | |
| 129 v8u16 reg0_m, reg1_m; \ | |
| 130 \ | |
| 131 vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0); \ | |
| 132 vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2); \ | |
| 133 vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0); \ | |
| 134 vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2); \ | |
| 135 reg0_m = __msa_dotp_u_h(vec0_m, const0); \ | |
| 136 reg1_m = __msa_dotp_u_h(vec1_m, const0); \ | |
| 137 reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1); \ | |
| 138 reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1); \ | |
| 139 reg0_m += const2; \ | |
| 140 reg1_m += const2; \ | |
| 141 reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift); \ | |
| 142 reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift); \ | |
| 143 y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ | |
| 144 } | |
| 145 | |
| 146 // Loads current and next row of ARGB input and averages it to calculate U and V | |
|
fbarchard1
2017/01/24 00:15:47
Loads. Not laods
manojkumar.bhosale
2017/01/25 08:35:42
Done. Sorry for typo.
| |
| 147 #define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \ | |
| 148 { \ | |
| 149 v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \ | |
| 150 v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ | |
| 151 v16u8 vec8_m, vec9_m; \ | |
| 152 v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \ | |
| 153 v8u16 reg8_m, reg9_m; \ | |
| 154 \ | |
| 155 src0_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 0); \ | |
| 156 src1_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 16); \ | |
| 157 src2_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 32); \ | |
| 158 src3_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 48); \ | |
| 159 src4_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 0); \ | |
| 160 src5_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 16); \ | |
| 161 src6_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 32); \ | |
| 162 src7_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 48); \ | |
| 163 vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ | |
| 164 vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ | |
| 165 vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ | |
| 166 vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ | |
| 167 vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ | |
| 168 vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ | |
| 169 vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ | |
| 170 vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ | |
| 171 reg0_m = __msa_hadd_u_h(vec0_m, vec0_m); \ | |
| 172 reg1_m = __msa_hadd_u_h(vec1_m, vec1_m); \ | |
| 173 reg2_m = __msa_hadd_u_h(vec2_m, vec2_m); \ | |
| 174 reg3_m = __msa_hadd_u_h(vec3_m, vec3_m); \ | |
| 175 reg4_m = __msa_hadd_u_h(vec4_m, vec4_m); \ | |
| 176 reg5_m = __msa_hadd_u_h(vec5_m, vec5_m); \ | |
| 177 reg6_m = __msa_hadd_u_h(vec6_m, vec6_m); \ | |
| 178 reg7_m = __msa_hadd_u_h(vec7_m, vec7_m); \ | |
| 179 reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ | |
| 180 reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ | |
| 181 reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ | |
| 182 reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ | |
| 183 reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ | |
| 184 reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ | |
| 185 reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ | |
| 186 reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ | |
| 187 reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ | |
| 188 reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ | |
| 189 reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ | |
| 190 reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ | |
| 191 argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ | |
| 192 argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ | |
| 193 src0_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 64); \ | |
| 194 src1_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 80); \ | |
| 195 src2_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 96); \ | |
| 196 src3_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 112); \ | |
| 197 src4_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 64); \ | |
| 198 src5_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 80); \ | |
| 199 src6_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 96); \ | |
| 200 src7_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 112); \ | |
| 201 vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ | |
| 202 vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ | |
| 203 vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ | |
| 204 vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ | |
| 205 vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ | |
| 206 vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ | |
| 207 vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ | |
| 208 vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ | |
| 209 reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \ | |
| 210 reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \ | |
| 211 reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \ | |
| 212 reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \ | |
| 213 reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \ | |
| 214 reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \ | |
| 215 reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \ | |
| 216 reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \ | |
| 217 reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ | |
| 218 reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ | |
| 219 reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ | |
| 220 reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ | |
| 221 reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ | |
| 222 reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ | |
| 223 reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ | |
| 224 reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ | |
| 225 reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ | |
| 226 reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ | |
| 227 reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ | |
| 228 reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ | |
| 229 argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ | |
| 230 argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ | |
| 231 } | |
| 232 | |
| 233 // Takes ARGB input and calculates U and V. | |
| 234 #define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ | |
| 235 shf0, shf1, shf2, shf3, v_out, u_out) \ | |
| 236 { \ | |
| 237 v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ | |
| 238 v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \ | |
| 239 \ | |
| 240 vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \ | |
| 241 vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \ | |
| 242 vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \ | |
| 243 vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \ | |
| 244 vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \ | |
| 245 vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \ | |
| 246 vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \ | |
| 247 vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \ | |
| 248 reg0_m = __msa_dotp_u_h(vec0_m, const1); \ | |
| 249 reg1_m = __msa_dotp_u_h(vec1_m, const1); \ | |
| 250 reg2_m = __msa_dotp_u_h(vec4_m, const1); \ | |
| 251 reg3_m = __msa_dotp_u_h(vec5_m, const1); \ | |
| 252 reg0_m += const3; \ | |
| 253 reg1_m += const3; \ | |
| 254 reg2_m += const3; \ | |
| 255 reg3_m += const3; \ | |
| 256 reg0_m -= __msa_dotp_u_h(vec2_m, const0); \ | |
| 257 reg1_m -= __msa_dotp_u_h(vec3_m, const0); \ | |
| 258 reg2_m -= __msa_dotp_u_h(vec6_m, const2); \ | |
| 259 reg3_m -= __msa_dotp_u_h(vec7_m, const2); \ | |
| 260 v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \ | |
| 261 u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \ | |
| 262 } | |
| 263 | |
| 124 void MirrorRow_MSA(const uint8* src, uint8* dst, int width) { | 264 void MirrorRow_MSA(const uint8* src, uint8* dst, int width) { |
| 125 int x; | 265 int x; |
| 126 v16u8 src0, src1, src2, src3; | 266 v16u8 src0, src1, src2, src3; |
| 127 v16u8 dst0, dst1, dst2, dst3; | 267 v16u8 dst0, dst1, dst2, dst3; |
| 128 v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; | 268 v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; |
| 129 src += width - 64; | 269 src += width - 64; |
| 130 | 270 |
| 131 for (x = 0; x < width; x += 64) { | 271 for (x = 0; x < width; x += 64) { |
| 132 LD_UB4(src, 16, src3, src2, src1, src0); | 272 LD_UB4(src, 16, src3, src2, src1, src0); |
| 133 VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); | 273 VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); |
| (...skipping 2099 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 2233 dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1); | 2373 dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1); |
| 2234 dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2); | 2374 dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2); |
| 2235 dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2); | 2375 dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2); |
| 2236 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); | 2376 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); |
| 2237 src_sobelx += 16; | 2377 src_sobelx += 16; |
| 2238 src_sobely += 16; | 2378 src_sobely += 16; |
| 2239 dst_argb += 64; | 2379 dst_argb += 64; |
| 2240 } | 2380 } |
| 2241 } | 2381 } |
| 2242 | 2382 |
| 2383 void ARGBToYJRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { | |
| 2384 int x; | |
| 2385 v16u8 src0, src1, src2, src3, dst0; | |
| 2386 v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); | |
| 2387 v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26); | |
| 2388 v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40); | |
| 2389 | |
| 2390 for (x = 0; x < width; x += 16) { | |
| 2391 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); | |
| 2392 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); | |
| 2393 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); | |
| 2394 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); | |
| 2395 ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7, | |
| 2396 dst0); | |
| 2397 ST_UB(dst0, dst_y); | |
| 2398 src_argb0 += 64; | |
| 2399 dst_y += 16; | |
| 2400 } | |
| 2401 } | |
| 2402 | |
| 2403 void BGRAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { | |
| 2404 int x; | |
| 2405 v16u8 src0, src1, src2, src3, dst0; | |
| 2406 v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200); | |
| 2407 v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981); | |
| 2408 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); | |
| 2409 | |
| 2410 for (x = 0; x < width; x += 16) { | |
| 2411 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); | |
| 2412 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); | |
| 2413 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); | |
| 2414 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); | |
| 2415 ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8, | |
| 2416 dst0); | |
| 2417 ST_UB(dst0, dst_y); | |
| 2418 src_argb0 += 64; | |
| 2419 dst_y += 16; | |
| 2420 } | |
| 2421 } | |
| 2422 | |
| 2423 void ABGRToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { | |
| 2424 int x; | |
| 2425 v16u8 src0, src1, src2, src3, dst0; | |
| 2426 v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142); | |
| 2427 v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19); | |
| 2428 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); | |
| 2429 | |
| 2430 for (x = 0; x < width; x += 16) { | |
| 2431 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); | |
| 2432 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); | |
| 2433 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); | |
| 2434 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); | |
| 2435 ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8, | |
| 2436 dst0); | |
| 2437 ST_UB(dst0, dst_y); | |
| 2438 src_argb0 += 64; | |
| 2439 dst_y += 16; | |
| 2440 } | |
| 2441 } | |
| 2442 | |
| 2443 void RGBAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { | |
| 2444 int x; | |
| 2445 v16u8 src0, src1, src2, src3, dst0; | |
| 2446 v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900); | |
| 2447 v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281); | |
| 2448 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); | |
| 2449 | |
| 2450 for (x = 0; x < width; x += 16) { | |
| 2451 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); | |
| 2452 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); | |
| 2453 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); | |
| 2454 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); | |
| 2455 ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8, | |
| 2456 dst0); | |
| 2457 ST_UB(dst0, dst_y); | |
| 2458 src_argb0 += 64; | |
| 2459 dst_y += 16; | |
| 2460 } | |
| 2461 } | |
| 2462 | |
| 2463 void ARGBToUVJRow_MSA(const uint8* src_rgb0, | |
| 2464 int src_stride_rgb, | |
| 2465 uint8* dst_u, | |
| 2466 uint8* dst_v, | |
| 2467 int width) { | |
| 2468 int x; | |
| 2469 const uint8* s = src_rgb0; | |
| 2470 const uint8* t = src_rgb0 + src_stride_rgb; | |
| 2471 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; | |
| 2472 v16u8 vec0, vec1, vec2, vec3; | |
| 2473 v16u8 dst0, dst1; | |
| 2474 v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; | |
| 2475 v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, | |
| 2476 18, 19, 22, 23, 26, 27, 30, 31}; | |
| 2477 v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; | |
| 2478 v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; | |
| 2479 v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F); | |
| 2480 v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14); | |
| 2481 v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54); | |
| 2482 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); | |
| 2483 | |
| 2484 for (x = 0; x < width; x += 32) { | |
|
fbarchard1
2017/01/24 00:15:47
can you make this function similar to others, doin
manojkumar.bhosale
2017/01/25 08:35:41
Actually sending a patchset to fix an issue in oth
| |
| 2485 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); | |
| 2486 src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); | |
| 2487 src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); | |
| 2488 src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); | |
| 2489 src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); | |
| 2490 src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); | |
| 2491 src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); | |
| 2492 src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); | |
| 2493 src0 = __msa_aver_u_b(src0, src4); | |
| 2494 src1 = __msa_aver_u_b(src1, src5); | |
| 2495 src2 = __msa_aver_u_b(src2, src6); | |
| 2496 src3 = __msa_aver_u_b(src3, src7); | |
| 2497 src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); | |
| 2498 src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); | |
| 2499 src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); | |
| 2500 src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); | |
| 2501 vec0 = __msa_aver_u_b(src4, src6); | |
| 2502 vec1 = __msa_aver_u_b(src5, src7); | |
| 2503 src0 = (v16u8)__msa_ld_b((v16i8*)s, 64); | |
| 2504 src1 = (v16u8)__msa_ld_b((v16i8*)s, 80); | |
| 2505 src2 = (v16u8)__msa_ld_b((v16i8*)s, 96); | |
| 2506 src3 = (v16u8)__msa_ld_b((v16i8*)s, 112); | |
| 2507 src4 = (v16u8)__msa_ld_b((v16i8*)t, 64); | |
| 2508 src5 = (v16u8)__msa_ld_b((v16i8*)t, 80); | |
| 2509 src6 = (v16u8)__msa_ld_b((v16i8*)t, 96); | |
| 2510 src7 = (v16u8)__msa_ld_b((v16i8*)t, 112); | |
| 2511 src0 = __msa_aver_u_b(src0, src4); | |
| 2512 src1 = __msa_aver_u_b(src1, src5); | |
| 2513 src2 = __msa_aver_u_b(src2, src6); | |
| 2514 src3 = __msa_aver_u_b(src3, src7); | |
| 2515 src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); | |
| 2516 src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); | |
| 2517 src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); | |
| 2518 src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); | |
| 2519 vec2 = __msa_aver_u_b(src4, src6); | |
| 2520 vec3 = __msa_aver_u_b(src5, src7); | |
| 2521 ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54, | |
| 2522 const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, | |
| 2523 dst1); | |
| 2524 ST_UB(dst0, dst_v); | |
| 2525 ST_UB(dst1, dst_u); | |
| 2526 s += 128; | |
| 2527 t += 128; | |
| 2528 dst_v += 16; | |
| 2529 dst_u += 16; | |
| 2530 } | |
| 2531 } | |
| 2532 | |
| 2533 void BGRAToUVRow_MSA(const uint8* src_rgb0, | |
| 2534 int src_stride_rgb, | |
| 2535 uint8* dst_u, | |
| 2536 uint8* dst_v, | |
| 2537 int width) { | |
| 2538 int x; | |
| 2539 const uint8* s = src_rgb0; | |
| 2540 const uint8* t = src_rgb0 + src_stride_rgb; | |
| 2541 v16u8 dst0, dst1, vec0, vec1, vec2, vec3; | |
| 2542 v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; | |
| 2543 v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, | |
| 2544 18, 19, 22, 23, 26, 27, 30, 31}; | |
| 2545 v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; | |
| 2546 v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; | |
| 2547 v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); | |
| 2548 v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); | |
| 2549 v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A); | |
| 2550 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); | |
| 2551 | |
| 2552 for (x = 0; x < width; x += 16) { | |
| 2553 READ_ARGB(s, t, vec0, vec1, vec2, vec3); | |
| 2554 ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, | |
| 2555 const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, | |
| 2556 dst1); | |
| 2557 ST_UB(dst0, dst_v); | |
| 2558 ST_UB(dst1, dst_u); | |
| 2559 s += 128; | |
| 2560 t += 128; | |
| 2561 dst_v += 16; | |
| 2562 dst_u += 16; | |
| 2563 } | |
| 2564 } | |
| 2565 | |
| 2566 void ABGRToUVRow_MSA(const uint8* src_rgb0, | |
| 2567 int src_stride_rgb, | |
| 2568 uint8* dst_u, | |
| 2569 uint8* dst_v, | |
| 2570 int width) { | |
| 2571 int x; | |
| 2572 const uint8* s = src_rgb0; | |
| 2573 const uint8* t = src_rgb0 + src_stride_rgb; | |
| 2574 v16u8 src0, src1, src2, src3; | |
| 2575 v16u8 dst0, dst1; | |
| 2576 v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; | |
| 2577 v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, | |
| 2578 18, 19, 22, 23, 26, 27, 30, 31}; | |
| 2579 v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; | |
| 2580 v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; | |
| 2581 v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26); | |
| 2582 v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070); | |
| 2583 v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); | |
| 2584 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); | |
| 2585 | |
| 2586 for (x = 0; x < width; x += 16) { | |
| 2587 READ_ARGB(s, t, src0, src1, src2, src3); | |
| 2588 ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E, | |
| 2589 const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, | |
| 2590 dst1); | |
| 2591 ST_UB(dst0, dst_u); | |
| 2592 ST_UB(dst1, dst_v); | |
| 2593 s += 128; | |
| 2594 t += 128; | |
| 2595 dst_u += 16; | |
| 2596 dst_v += 16; | |
| 2597 } | |
| 2598 } | |
| 2599 | |
| 2600 void RGBAToUVRow_MSA(const uint8* src_rgb0, | |
| 2601 int src_stride_rgb, | |
| 2602 uint8* dst_u, | |
| 2603 uint8* dst_v, | |
| 2604 int width) { | |
| 2605 int x; | |
| 2606 const uint8* s = src_rgb0; | |
| 2607 const uint8* t = src_rgb0 + src_stride_rgb; | |
| 2608 v16u8 dst0, dst1, vec0, vec1, vec2, vec3; | |
| 2609 v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; | |
| 2610 v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, | |
| 2611 18, 19, 22, 23, 26, 27, 30, 31}; | |
| 2612 v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; | |
| 2613 v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; | |
| 2614 v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A); | |
| 2615 v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); | |
| 2616 v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); | |
| 2617 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); | |
| 2618 | |
| 2619 for (x = 0; x < width; x += 16) { | |
| 2620 READ_ARGB(s, t, vec0, vec1, vec2, vec3); | |
| 2621 ARGBTOUV(vec0, vec1, vec2, vec3, const_0x264A, const_0x7000, const_0x125E, | |
| 2622 const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, | |
| 2623 dst1); | |
| 2624 ST_UB(dst0, dst_u); | |
| 2625 ST_UB(dst1, dst_v); | |
| 2626 s += 128; | |
| 2627 t += 128; | |
| 2628 dst_u += 16; | |
| 2629 dst_v += 16; | |
| 2630 } | |
| 2631 } | |
| 2632 | |
| 2243 #ifdef __cplusplus | 2633 #ifdef __cplusplus |
| 2244 } // extern "C" | 2634 } // extern "C" |
| 2245 } // namespace libyuv | 2635 } // namespace libyuv |
| 2246 #endif | 2636 #endif |
| 2247 | 2637 |
| 2248 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) | 2638 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) |
| OLD | NEW |