| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 29 matching lines...) Expand all Loading... |
| 40 v4i32 zero_m = {0}; \ | 40 v4i32 zero_m = {0}; \ |
| 41 y_m = LD(psrc_y); \ | 41 y_m = LD(psrc_y); \ |
| 42 u_m = LW(psrc_u); \ | 42 u_m = LW(psrc_u); \ |
| 43 v_m = LW(psrc_v); \ | 43 v_m = LW(psrc_v); \ |
| 44 out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64)y_m); \ | 44 out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64)y_m); \ |
| 45 out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m); \ | 45 out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m); \ |
| 46 out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \ | 46 out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \ |
| 47 } | 47 } |
| 48 | 48 |
// Convert 8 pixels of YUV 420 to RGB.
//
// in_y:  8 Y bytes in the low half of the vector.
// in_uv: 8 interleaved chroma bytes U0,V0,U1,V1,... in the low half
//        (callers build this with ilvr_b, or load it directly for NV12).
// ubvr:  {ub, vr, ub, vr} word coefficients; ugvg: {ug, vg} halfword pairs
//        (pre-packed by callers from the YUVTORGB_SETUP outputs).
// bb, bg, br: per-channel bias word vectors; yg: Y gain word vector.
// out_b, out_g, out_r: 8 channel values each, clamped to [0, 255] (v8i16).
//
// NOTE(review): registers are deliberately reused and the even/odd lane
// bookkeeping is order-sensitive; keep the statement order intact.
#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
  {                                                                            \
    v8i16 vec0_m, vec1_m;                                                      \
    v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m;                              \
    v4i32 reg5_m, reg6_m, reg7_m;                                              \
    v4i32 max = __msa_ldi_w(255);                                              \
    v16i8 zero = {0};                                                          \
                                                                               \
    /* Duplicate each Y byte into both bytes of a halfword (y * 0x0101). */    \
    vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y);                    \
    /* Zero-extend the interleaved U/V bytes to halfwords. */                  \
    vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)in_uv);                   \
    /* Widen to 32 bits: reg0/reg1 = Y lanes, reg2/reg3 = U/V lanes. */        \
    reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0_m);                  \
    reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0_m);                  \
    reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1_m);                  \
    reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1_m);                  \
    /* Luma term: (y * 0x0101 * yg) >> 16. */                                  \
    reg0_m *= yg;                                                              \
    reg1_m *= yg;                                                              \
    /* Chroma products: even lanes become U * ub, odd lanes V * vr. */         \
    reg2_m *= ubvr;                                                            \
    reg3_m *= ubvr;                                                            \
    reg0_m = __msa_srai_w(reg0_m, 16);                                         \
    reg1_m = __msa_srai_w(reg1_m, 16);                                         \
    /* Green chroma term: U * ug + V * vg per (U, V) halfword pair. */         \
    reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg);                       \
    /* Duplicate each chroma product so it covers its two pixels:     */       \
    /* reg5/reg6 = U*ub, reg2/reg3 = V*vr, reg7/reg4 = U*ug + V*vg.   */       \
    reg5_m = __msa_ilvev_w(reg2_m, reg2_m);                                    \
    reg6_m = __msa_ilvev_w(reg3_m, reg3_m);                                    \
    reg7_m = __msa_ilvr_w(reg4_m, reg4_m);                                     \
    reg2_m = __msa_ilvod_w(reg2_m, reg2_m);                                    \
    reg3_m = __msa_ilvod_w(reg3_m, reg3_m);                                    \
    reg4_m = __msa_ilvl_w(reg4_m, reg4_m);                                     \
    /* B = Y - U*ub + bb; G = Y - (U*ug + V*vg) + bg; R = Y - V*vr + br. */    \
    reg5_m = reg0_m - reg5_m;                                                  \
    reg6_m = reg1_m - reg6_m;                                                  \
    reg2_m = reg0_m - reg2_m;                                                  \
    reg3_m = reg1_m - reg3_m;                                                  \
    reg7_m = reg0_m - reg7_m;                                                  \
    reg4_m = reg1_m - reg4_m;                                                  \
    reg5_m += bb;                                                              \
    reg6_m += bb;                                                              \
    reg7_m += bg;                                                              \
    reg4_m += bg;                                                              \
    reg2_m += br;                                                              \
    reg3_m += br;                                                              \
    /* Drop the 6 fractional bits, then clamp to [0, 255]. */                  \
    reg5_m = __msa_srai_w(reg5_m, 6);                                          \
    reg6_m = __msa_srai_w(reg6_m, 6);                                          \
    reg7_m = __msa_srai_w(reg7_m, 6);                                          \
    reg4_m = __msa_srai_w(reg4_m, 6);                                          \
    reg2_m = __msa_srai_w(reg2_m, 6);                                          \
    reg3_m = __msa_srai_w(reg3_m, 6);                                          \
    reg5_m = __msa_maxi_s_w(reg5_m, 0);                                        \
    reg6_m = __msa_maxi_s_w(reg6_m, 0);                                        \
    reg7_m = __msa_maxi_s_w(reg7_m, 0);                                        \
    reg4_m = __msa_maxi_s_w(reg4_m, 0);                                        \
    reg2_m = __msa_maxi_s_w(reg2_m, 0);                                        \
    reg3_m = __msa_maxi_s_w(reg3_m, 0);                                        \
    reg5_m = __msa_min_s_w(max, reg5_m);                                       \
    reg6_m = __msa_min_s_w(max, reg6_m);                                       \
    reg7_m = __msa_min_s_w(max, reg7_m);                                       \
    reg4_m = __msa_min_s_w(max, reg4_m);                                       \
    reg2_m = __msa_min_s_w(max, reg2_m);                                       \
    reg3_m = __msa_min_s_w(max, reg3_m);                                       \
    /* Narrow each channel back to 8 halfword results. */                      \
    out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m);                       \
    out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m);                       \
    out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m);                       \
  }
| 110 | 111 |
| 111 // Pack and Store 8 ARGB values. | 112 // Pack and Store 8 ARGB values. |
| 112 #define STOREARGB(in0, in1, in2, in3, pdst_argb) \ | 113 #define STOREARGB(in0, in1, in2, in3, pdst_argb) \ |
| 113 { \ | 114 { \ |
| 114 v8i16 vec0_m, vec1_m; \ | 115 v8i16 vec0_m, vec1_m; \ |
| 115 v16u8 dst0_m, dst1_m; \ | 116 v16u8 dst0_m, dst1_m; \ |
| 116 vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ | 117 vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ |
| 117 vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ | 118 vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ |
| 118 dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \ | 119 dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \ |
| (...skipping 86 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
// Convert a row of I422 (8-bit Y plus half-width U and V planes) to ARGB.
// Processes 8 pixels per iteration; alpha is forced to 255.
// NOTE(review): the loop always stores full 8-pixel groups — assumes the
// caller follows the libyuv row convention for non-multiple-of-8 widths.
void I422ToARGBRow_MSA(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 src0, src1, src2;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 const_255 = (v16u8)__msa_ldi_b(255);  // alpha channel

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  // Pre-pack the chroma coefficients once per row so YUVTORGB can consume
  // the interleaved U/V vector directly.
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    // Interleave U and V bytes into a single U0,V0,U1,V1,... vector.
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    STOREARGB(vec0, vec1, vec2, const_255, rgb_buf);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    rgb_buf += 32;  // 8 pixels * 4 bytes
  }
}
| 231 | 236 |
// Convert a row of I422 to RGBA: same pipeline as I422ToARGBRow_MSA, but
// the constant 255 alpha is passed as the *first* byte to STOREARGB, so the
// per-pixel byte order comes out A,B,G,R instead of B,G,R,A.
void I422ToRGBARow_MSA(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 src0, src1, src2;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 const_255 = (v16u8)__msa_ldi_b(255);  // alpha channel

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  // Pre-pack chroma coefficients once per row (see YUVTORGB).
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    // Interleave U and V bytes into a single U0,V0,U1,V1,... vector.
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    STOREARGB(const_255, vec0, vec1, vec2, rgb_buf);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    rgb_buf += 32;  // 8 pixels * 4 bytes
  }
}
| 258 | 267 |
// Convert a row of I422 plus a separate alpha plane to ARGB.
// Identical to I422ToARGBRow_MSA except the alpha byte for each pixel is
// read from src_a instead of being the constant 255.
void I422AlphaToARGBRow_MSA(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            const uint8* src_a,
                            uint8* rgb_buf,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  int x;
  int64 data_a;
  v16u8 src0, src1, src2, src3;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v4i32 zero = {0};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  // Pre-pack chroma coefficients once per row (see YUVTORGB).
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    data_a = LD(src_a);  // 8 alpha bytes
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    // Interleave U and V bytes into a single U0,V0,U1,V1,... vector.
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    // Widen alpha bytes to halfword lanes (byte duplicated) for STOREARGB.
    src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
    STOREARGB(vec0, vec1, vec2, src3, rgb_buf);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    src_a += 8;
    rgb_buf += 32;  // 8 pixels * 4 bytes
  }
}
| 291 | 304 |
// Convert a row of I422 to packed 24-bit RGB (no alpha), 16 pixels per
// iteration (two 8-pixel YUVTORGB passes), producing 48 output bytes.
// The three shuffler tables weave the B/G/R bytes of 16 pixels into three
// consecutive 16-byte stores.
void I422ToRGB24Row_MSA(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* rgb_buf,
                        const struct YuvConstants* yuvconstants,
                        int32 width) {
  int x;
  int64 data_u, data_v;
  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 reg0, reg1, reg2, reg3;
  v2i64 zero = {0};
  v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
  v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10};
  v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10,
                     11, 29, 12, 13, 30, 14, 15, 31};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  // Pre-pack chroma coefficients once per row (see YUVTORGB).
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0);  // 16 Y bytes
    data_u = LD(src_u);                          // 8 U bytes
    data_v = LD(src_v);                          // 8 V bytes
    src1 = (v16u8)__msa_insert_d(zero, 0, data_u);
    src2 = (v16u8)__msa_insert_d(zero, 0, data_v);
    // Interleave U and V into one UVUV... vector for both 8-pixel halves.
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    // Shift the second 8 Y bytes / 8 UV bytes into the low halves.
    src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
    src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec3, vec4, vec5);
    // Pack B/G pairs and all R bytes, then shuffle into 3x16 output bytes.
    reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
    reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
    reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
    reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11);
    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0);
    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1);
    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2);
    ST_UB2(dst0, dst1, rgb_buf, 16);
    ST_UB(dst2, (rgb_buf + 32));
    src_y += 16;
    src_u += 8;
    src_v += 8;
    rgb_buf += 48;  // 16 pixels * 3 bytes
  }
}
| 341 | 357 |
// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
// Convert a row of I422 to RGB565: 5 bits B (bits 0-4), 6 bits G (5-10),
// 5 bits R (11-15) per 16-bit pixel, 8 pixels per iteration.
// Note the YUVTORGB output order here: vec0 = B, vec2 = G, vec1 = R.
void I422ToRGB565Row_MSA(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_rgb565,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  int x;
  v16u8 src0, src1, src2, dst0;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  // Pre-pack chroma coefficients once per row (see YUVTORGB).
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    // Interleave U and V bytes into a single U0,V0,U1,V1,... vector.
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec2, vec1);
    vec0 = __msa_srai_h(vec0, 3);   // B -> 5 bits at bit 0
    vec1 = __msa_srai_h(vec1, 3);   // R -> 5 bits
    vec2 = __msa_srai_h(vec2, 2);   // G -> 6 bits
    vec1 = __msa_slli_h(vec1, 11);  // R into bits 11-15
    vec2 = __msa_slli_h(vec2, 5);   // G into bits 5-10
    vec0 |= vec1;
    dst0 = (v16u8)(vec2 | vec0);
    ST_UB(dst0, dst_rgb565);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    dst_rgb565 += 16;  // 8 pixels * 2 bytes
  }
}
| 375 | 395 |
// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
// Convert a row of I422 to ARGB4444: 4 bits each of B (bits 0-3), G (4-7),
// R (8-11), with alpha forced to 0xF (bits 12-15), 8 pixels per iteration.
void I422ToARGB4444Row_MSA(const uint8* src_y,
                           const uint8* src_u,
                           const uint8* src_v,
                           uint8* dst_argb4444,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  int x;
  v16u8 src0, src1, src2, dst0;
  v8i16 vec0, vec1, vec2;
  v8u16 reg0, reg1, reg2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);  // opaque alpha nibble

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  // Pre-pack chroma coefficients once per row (see YUVTORGB).
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    // Interleave U and V bytes into a single U0,V0,U1,V1,... vector.
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    reg0 = (v8u16)__msa_srai_h(vec0, 4);          // B -> 4 bits
    reg1 = (v8u16)__msa_srai_h(vec1, 4);          // G -> 4 bits
    reg2 = (v8u16)__msa_srai_h(vec2, 4);          // R -> 4 bits
    reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4);   // G into bits 4-7
    reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8);   // R into bits 8-11
    reg1 |= const_0xF000;                         // A = 0xF in bits 12-15
    reg0 |= reg2;
    dst0 = (v16u8)(reg1 | reg0);
    ST_UB(dst0, dst_argb4444);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    dst_argb4444 += 16;  // 8 pixels * 2 bytes
  }
}
| 412 | 436 |
| 413 void I422ToARGB1555Row_MSA(const uint8* src_y, | 437 void I422ToARGB1555Row_MSA(const uint8* src_y, |
| 414 const uint8* src_u, | 438 const uint8* src_u, |
| 415 const uint8* src_v, | 439 const uint8* src_v, |
| 416 uint8* dst_argb1555, | 440 uint8* dst_argb1555, |
| 417 const struct YuvConstants* yuvconstants, | 441 const struct YuvConstants* yuvconstants, |
| 418 int width) { | 442 int width) { |
| 419 int x; | 443 int x; |
| 420 v16u8 src0, src1, src2, dst0; | 444 v16u8 src0, src1, src2, dst0; |
| 421 v8i16 vec0, vec1, vec2; | 445 v8i16 vec0, vec1, vec2; |
| 422 v8u16 reg0, reg1, reg2; | 446 v8u16 reg0, reg1, reg2; |
| 423 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; | 447 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; |
| 448 v4i32 vec_ubvr, vec_ugvg; |
| 424 v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000); | 449 v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000); |
| 425 | 450 |
| 426 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, | 451 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, |
| 427 vec_br, vec_yg); | 452 vec_br, vec_yg); |
| 453 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); |
| 454 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); |
| 428 | 455 |
| 429 for (x = 0; x < width; x += 8) { | 456 for (x = 0; x < width; x += 8) { |
| 430 READYUV422(src_y, src_u, src_v, src0, src1, src2); | 457 READYUV422(src_y, src_u, src_v, src0, src1, src2); |
| 431 YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, | 458 src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); |
| 432 vec_br, vec_yg, vec0, vec1, vec2); | 459 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, |
| 460 vec0, vec1, vec2); |
| 433 reg0 = (v8u16)__msa_srai_h(vec0, 3); | 461 reg0 = (v8u16)__msa_srai_h(vec0, 3); |
| 434 reg1 = (v8u16)__msa_srai_h(vec1, 3); | 462 reg1 = (v8u16)__msa_srai_h(vec1, 3); |
| 435 reg2 = (v8u16)__msa_srai_h(vec2, 3); | 463 reg2 = (v8u16)__msa_srai_h(vec2, 3); |
| 436 reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5); | 464 reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5); |
| 437 reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10); | 465 reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10); |
| 438 reg1 |= const_0x8000; | 466 reg1 |= const_0x8000; |
| 439 reg0 |= reg2; | 467 reg0 |= reg2; |
| 440 dst0 = (v16u8)(reg1 | reg0); | 468 dst0 = (v16u8)(reg1 | reg0); |
| 441 ST_UB(dst0, dst_argb1555); | 469 ST_UB(dst0, dst_argb1555); |
| 442 src_y += 8; | 470 src_y += 8; |
| (...skipping 1573 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2016 res1 = __msa_copy_u_d((v2i64)dst0, 1); | 2044 res1 = __msa_copy_u_d((v2i64)dst0, 1); |
| 2017 SD(res0, dst_u); | 2045 SD(res0, dst_u); |
| 2018 SD(res1, dst_v); | 2046 SD(res1, dst_v); |
| 2019 t += 48; | 2047 t += 48; |
| 2020 s += 48; | 2048 s += 48; |
| 2021 dst_u += 8; | 2049 dst_u += 8; |
| 2022 dst_v += 8; | 2050 dst_v += 8; |
| 2023 } | 2051 } |
| 2024 } | 2052 } |
| 2025 | 2053 |
// Convert a row of NV12 (Y plane plus interleaved UV plane) to ARGB,
// 8 pixels per iteration. The UV bytes are already interleaved in memory,
// so they feed YUVTORGB directly; ARGB packing is done inline with
// ilvev/ilvr instead of STOREARGB.
void NV12ToARGBRow_MSA(const uint8* src_y,
                       const uint8* src_uv,
                       uint8* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  uint64 val0, val1;
  v16u8 src0, src1, res0, res1, dst0, dst1;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 zero = {0};
  v16u8 const_255 = (v16u8)__msa_ldi_b(255);  // alpha channel

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  // Pre-pack chroma coefficients once per row (see YUVTORGB).
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    val0 = LD(src_y);   // 8 Y bytes
    val1 = LD(src_uv);  // 4 interleaved U/V pairs
    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    // Weave B,R and G,255 byte pairs, then interleave into B,G,R,A pixels.
    res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
    res1 = (v16u8)__msa_ilvev_b((v16i8)const_255, (v16i8)vec1);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
    ST_UB2(dst0, dst1, rgb_buf, 16);
    src_y += 8;
    src_uv += 8;
    rgb_buf += 32;  // 8 pixels * 4 bytes
  }
}
| 2090 |
// Convert a row of NV12 to RGB565: 5 bits B (bits 0-4), 6 bits G (5-10),
// 5 bits R (11-15) per 16-bit pixel, 8 pixels per iteration.
void NV12ToRGB565Row_MSA(const uint8* src_y,
                         const uint8* src_uv,
                         uint8* rgb_buf,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  int x;
  uint64 val0, val1;
  v16u8 src0, src1, dst0;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 zero = {0};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  // Pre-pack chroma coefficients once per row (see YUVTORGB).
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    val0 = LD(src_y);   // 8 Y bytes
    val1 = LD(src_uv);  // 4 interleaved U/V pairs
    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    vec0 = vec0 >> 3;          // B -> 5 bits at bit 0
    vec1 = (vec1 >> 2) << 5;   // G -> 6 bits at bit 5
    vec2 = (vec2 >> 3) << 11;  // R -> 5 bits at bit 11
    dst0 = (v16u8)(vec0 | vec1 | vec2);
    ST_UB(dst0, rgb_buf);
    src_y += 8;
    src_uv += 8;
    rgb_buf += 16;  // 8 pixels * 2 bytes
  }
}
| 2126 |
// Convert a row of NV21 (Y plane plus interleaved VU plane) to ARGB,
// 8 pixels per iteration. Same as NV12ToARGBRow_MSA except the byte-pair
// shuffle first swaps each V,U pair into the U,V order YUVTORGB expects.
void NV21ToARGBRow_MSA(const uint8* src_y,
                       const uint8* src_vu,
                       uint8* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  uint64 val0, val1;
  v16u8 src0, src1, res0, res1, dst0, dst1;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 const_255 = (v16u8)__msa_ldi_b(255);  // alpha channel
  v16u8 zero = {0};
  // Swaps adjacent bytes: V0,U0,V1,U1,... -> U0,V0,U1,V1,...
  v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  // Pre-pack chroma coefficients once per row (see YUVTORGB).
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    val0 = LD(src_y);   // 8 Y bytes
    val1 = LD(src_vu);  // 4 interleaved V/U pairs
    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
    src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    // Weave B,R and G,255 byte pairs, then interleave into B,G,R,A pixels.
    res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
    res1 = (v16u8)__msa_ilvev_b((v16i8)const_255, (v16i8)vec1);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
    ST_UB2(dst0, dst1, rgb_buf, 16);
    src_y += 8;
    src_vu += 8;
    rgb_buf += 32;  // 8 pixels * 4 bytes
  }
}
| 2165 |
| 2166 void SobelRow_MSA(const uint8* src_sobelx, |
| 2167 const uint8* src_sobely, |
| 2168 uint8* dst_argb, |
| 2169 int width) { |
| 2170 int x; |
| 2171 v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3; |
| 2172 v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16}; |
| 2173 v16i8 const_0x4 = __msa_ldi_b(0x4); |
| 2174 v16i8 mask1 = mask0 + const_0x4; |
| 2175 v16i8 mask2 = mask1 + const_0x4; |
| 2176 v16i8 mask3 = mask2 + const_0x4; |
| 2177 v16u8 const_0xFF = (v16u8)__msa_ldi_b(0xFF); |
| 2178 |
| 2179 for (x = 0; x < width; x += 16) { |
| 2180 src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0); |
| 2181 src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0); |
| 2182 vec0 = __msa_adds_u_b(src0, src1); |
| 2183 dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)const_0xFF, (v16i8)vec0); |
| 2184 dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)const_0xFF, (v16i8)vec0); |
| 2185 dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)const_0xFF, (v16i8)vec0); |
| 2186 dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)const_0xFF, (v16i8)vec0); |
| 2187 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); |
| 2188 src_sobelx += 16; |
| 2189 src_sobely += 16; |
| 2190 dst_argb += 64; |
| 2191 } |
| 2192 } |
| 2193 |
| 2194 void SobelToPlaneRow_MSA(const uint8* src_sobelx, |
| 2195 const uint8* src_sobely, |
| 2196 uint8* dst_y, |
| 2197 int width) { |
| 2198 int x; |
| 2199 v16u8 src0, src1, src2, src3, dst0, dst1; |
| 2200 |
| 2201 for (x = 0; x < width; x += 32) { |
| 2202 src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0); |
| 2203 src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16); |
| 2204 src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0); |
| 2205 src3 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 16); |
| 2206 dst0 = __msa_adds_u_b(src0, src2); |
| 2207 dst1 = __msa_adds_u_b(src1, src3); |
| 2208 ST_UB2(dst0, dst1, dst_y, 16); |
| 2209 src_sobelx += 32; |
| 2210 src_sobely += 32; |
| 2211 dst_y += 32; |
| 2212 } |
| 2213 } |
| 2214 |
| 2215 void SobelXYRow_MSA(const uint8* src_sobelx, |
| 2216 const uint8* src_sobely, |
| 2217 uint8* dst_argb, |
| 2218 int width) { |
| 2219 int x; |
| 2220 v16u8 src0, src1, vec0, vec1, vec2; |
| 2221 v16u8 reg0, reg1, dst0, dst1, dst2, dst3; |
| 2222 v16u8 const_0xFF = (v16u8)__msa_ldi_b(0xFF); |
| 2223 |
| 2224 for (x = 0; x < width; x += 16) { |
| 2225 src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0); |
| 2226 src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0); |
| 2227 vec0 = __msa_adds_u_b(src0, src1); |
| 2228 vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1); |
| 2229 vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1); |
| 2230 reg0 = (v16u8)__msa_ilvr_b((v16i8)const_0xFF, (v16i8)vec0); |
| 2231 reg1 = (v16u8)__msa_ilvl_b((v16i8)const_0xFF, (v16i8)vec0); |
| 2232 dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1); |
| 2233 dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1); |
| 2234 dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2); |
| 2235 dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2); |
| 2236 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); |
| 2237 src_sobelx += 16; |
| 2238 src_sobely += 16; |
| 2239 dst_argb += 64; |
| 2240 } |
| 2241 } |
| 2242 |
| 2026 #ifdef __cplusplus | 2243 #ifdef __cplusplus |
| 2027 } // extern "C" | 2244 } // extern "C" |
| 2028 } // namespace libyuv | 2245 } // namespace libyuv |
| 2029 #endif | 2246 #endif |
| 2030 | 2247 |
| 2031 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) | 2248 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) |
| OLD | NEW |