| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include "libyuv/rotate_row.h" | 11 #include "libyuv/rotate_row.h" |
| 12 | 12 |
| 13 // This module is for GCC MSA | 13 // This module is for GCC MSA |
| 14 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) | 14 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) |
| 15 #include "libyuv/macros_msa.h" | 15 #include "libyuv/macros_msa.h" |
| 16 | 16 |
| 17 #ifdef __cplusplus | 17 #ifdef __cplusplus |
| 18 namespace libyuv { | 18 namespace libyuv { |
| 19 extern "C" { | 19 extern "C" { |
| 20 #endif | 20 #endif |
| 21 | 21 |
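| // Each ILVRL_* macro interleaves two pairs of vectors at one element |
| // width (byte/halfword/word/doubleword), producing the right- and |
| // left-half merges that form one butterfly stage of a transpose. |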
| 22 void TransposeWx8_MSA(const uint8_t* src, | 22 #define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \ |
| 23 int src_stride, | 23 { \ |
| 24 uint8_t* dst, | 24 out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0); \ |
| 25 int dst_stride, | 25 out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0); \ |
| 26 int width) { | 26 out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2); \ |
| 27 out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2); \ |
| 28 } |
| 29 |
| 30 #define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \ |
| 31 { \ |
| 32 out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0); \ |
| 33 out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0); \ |
| 34 out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2); \ |
| 35 out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2); \ |
| 36 } |
| 37 |
| 38 #define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \ |
| 39 { \ |
| 40 out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0); \ |
| 41 out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0); \ |
| 42 out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2); \ |
| 43 out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2); \ |
| 44 } |
| 45 |
| 46 #define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \ |
| 47 { \ |
| 48 out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0); \ |
| 49 out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0); \ |
| 50 out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2); \ |
| 51 out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ |
| 52 } |
| 53 |
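| // Wx16 C fallback: transpose the top and bottom Wx8 halves separately; |
| // source rows 8-15 land as destination columns 8-15 (dst + 8). |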
| 54 void TransposeWx16_C(const uint8_t* src, |
| 55 int src_stride, |
| 56 uint8_t* dst, |
| 57 int dst_stride, |
| 58 int width) { |
| 59 TransposeWx8_C(src, src_stride, dst, dst_stride, width); |
| 60 TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, |
| 61 width); |
| 62 } |
| 63 |
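| // Same two-pass Wx8 decomposition for interleaved UV input. |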
| 64 void TransposeUVWx16_C(const uint8_t* src, |
| 65 int src_stride, |
| 66 uint8_t* dst_a, |
| 67 int dst_stride_a, |
| 68 uint8_t* dst_b, |
| 69 int dst_stride_b, |
| 70 int width) { |
| 71 TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, |
| 72 width); |
| 73 TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8), |
| 74 dst_stride_a, (dst_b + 8), dst_stride_b, width); |
| 75 } |
| 76 |
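| // MSA path: transpose one 16x16 byte tile per iteration by loading 16 |
| // rows in groups of four and interleaving at byte, halfword, word and |
| // doubleword widths before storing 16 output rows. |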
| 77 void TransposeWx16_MSA(const uint8_t* src, |
| 78 int src_stride, |
| 79 uint8_t* dst, |
| 80 int dst_stride, |
| 81 int width) { |
| 27 int x; | 82 int x; |
| 28 uint64_t val0, val1, val2, val3; | 83 const uint8_t* s; |
| 29 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; | 84 v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; |
| 30 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | |
| 31 v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; | 85 v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; |
| 32 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; | 86 v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; |
| 33 | 87 |
| 34 for (x = 0; x < width; x += 16) { | 88 for (x = 0; x < width; x += 16) { |
| 35 src0 = (v16u8)__msa_ld_b((v16i8*)src, 0); | 89 s = src; |
| 36 src1 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride), 0); | 90 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 37 src2 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 2), 0); | 91 s += src_stride; |
| 38 src3 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 3), 0); | 92 src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 39 src4 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 4), 0); | 93 s += src_stride; |
| 40 src5 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 5), 0); | 94 src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 41 src6 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 6), 0); | 95 s += src_stride; |
| 42 src7 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 7), 0); | 96 src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 43 vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); | 97 s += src_stride; |
| 44 vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); | 98 ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); |
| 45 vec2 = (v16u8)__msa_ilvr_b((v16i8)src6, (v16i8)src4); | 99 ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); |
| 46 vec3 = (v16u8)__msa_ilvr_b((v16i8)src7, (v16i8)src5); | 100 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 47 vec4 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); | 101 s += src_stride; |
| 48 vec5 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); | 102 src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 49 vec6 = (v16u8)__msa_ilvl_b((v16i8)src6, (v16i8)src4); | 103 s += src_stride; |
| 50 vec7 = (v16u8)__msa_ilvl_b((v16i8)src7, (v16i8)src5); | 104 src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 51 reg0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0); | 105 s += src_stride; |
| 52 reg1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0); | 106 src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 53 reg2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec2); | 107 s += src_stride; |
| 54 reg3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec2); | 108 ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); |
| 55 reg4 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4); | 109 ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); |
| 56 reg5 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4); | 110 ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); |
| 57 reg6 = (v16u8)__msa_ilvr_b((v16i8)vec7, (v16i8)vec6); | 111 ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); |
| 58 reg7 = (v16u8)__msa_ilvl_b((v16i8)vec7, (v16i8)vec6); | 112 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 59 dst0 = (v16u8)__msa_ilvr_w((v4i32)reg2, (v4i32)reg0); | 113 s += src_stride; |
| 60 dst1 = (v16u8)__msa_ilvl_w((v4i32)reg2, (v4i32)reg0); | 114 src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 61 dst2 = (v16u8)__msa_ilvr_w((v4i32)reg3, (v4i32)reg1); | 115 s += src_stride; |
| 62 dst3 = (v16u8)__msa_ilvl_w((v4i32)reg3, (v4i32)reg1); | 116 src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 63 dst4 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg4); | 117 s += src_stride; |
| 64 dst5 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg4); | 118 src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 65 dst6 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg5); | 119 s += src_stride; |
| 66 dst7 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg5); | 120 ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); |
| 67 val0 = __msa_copy_s_d((v2i64)dst0, 0); | 121 ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); |
| 68 val1 = __msa_copy_s_d((v2i64)dst0, 1); | 122 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 69 val2 = __msa_copy_s_d((v2i64)dst1, 0); | 123 s += src_stride; |
| 70 val3 = __msa_copy_s_d((v2i64)dst1, 1); | 124 src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 71 SD(val0, dst); | 125 s += src_stride; |
| 72 SD(val1, dst + dst_stride); | 126 src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 73 SD(val2, dst + dst_stride * 2); | 127 s += src_stride; |
| 74 SD(val3, dst + dst_stride * 3); | 128 src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 75 dst += dst_stride * 4; | 129 s += src_stride; |
| 76 val0 = __msa_copy_s_d((v2i64)dst2, 0); | 130 ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); |
| 77 val1 = __msa_copy_s_d((v2i64)dst2, 1); | 131 ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); |
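| // Merge the rows 8-15 results with res0-res7 (rows 0-7) at word and |
| // doubleword widths, storing four finished 16-byte rows per group. |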
| 78 val2 = __msa_copy_s_d((v2i64)dst3, 0); | 132 res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); |
| 79 val3 = __msa_copy_s_d((v2i64)dst3, 1); | 133 res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); |
| 80 SD(val0, dst); | 134 ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); |
| 81 SD(val1, dst + dst_stride); | 135 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); |
| 82 SD(val2, dst + dst_stride * 2); | 136 dst += dst_stride * 4; |
| 83 SD(val3, dst + dst_stride * 3); | 137 res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); |
| 84 dst += dst_stride * 4; | 138 res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); |
| 85 val0 = __msa_copy_s_d((v2i64)dst4, 0); | 139 ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); |
| 86 val1 = __msa_copy_s_d((v2i64)dst4, 1); | 140 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); |
| 87 val2 = __msa_copy_s_d((v2i64)dst5, 0); | 141 dst += dst_stride * 4; |
| 88 val3 = __msa_copy_s_d((v2i64)dst5, 1); | 142 res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); |
| 89 SD(val0, dst); | 143 res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); |
| 90 SD(val1, dst + dst_stride); | 144 ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); |
| 91 SD(val2, dst + dst_stride * 2); | 145 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); |
| 92 SD(val3, dst + dst_stride * 3); | 146 dst += dst_stride * 4; |
| 93 dst += dst_stride * 4; | 147 res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); |
| 94 val0 = __msa_copy_s_d((v2i64)dst6, 0); | 148 res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); |
| 95 val1 = __msa_copy_s_d((v2i64)dst6, 1); | 149 ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); |
| 96 val2 = __msa_copy_s_d((v2i64)dst7, 0); | 150 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); |
| 97 val3 = __msa_copy_s_d((v2i64)dst7, 1); | |
| 98 SD(val0, dst); | |
| 99 SD(val1, dst + dst_stride); | |
| 100 SD(val2, dst + dst_stride * 2); | |
| 101 SD(val3, dst + dst_stride * 3); | |
| 102 dst += dst_stride * 4; | |
| 103 src += 16; | 151 src += 16; |
| 104 } | 152 dst += dst_stride * 4; |
| 105 } | 153 } |
| 106 | 154 } |
| 107 void TransposeUVWx8_MSA(const uint8_t* src, | 155 |
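| // MSA UV path: each iteration transposes 8 UV pairs (a 16x16 byte |
| // tile) and de-interleaves the result across the two planes. |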
| 108 int src_stride, | 156 void TransposeUVWx16_MSA(const uint8_t* src, |
| 109 uint8_t* dst_a, | 157 int src_stride, |
| 110 int dst_stride_a, | 158 uint8_t* dst_a, |
| 111 uint8_t* dst_b, | 159 int dst_stride_a, |
| 112 int dst_stride_b, | 160 uint8_t* dst_b, |
| 113 int width) { | 161 int dst_stride_b, |
| 162 int width) { |
| 114 int x; | 163 int x; |
| 115 uint64_t val0, val1, val2, val3; | 164 const uint8_t* s; |
| 116 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; | 165 v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; |
| 117 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | |
| 118 v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; | 166 v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; |
| 119 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; | 167 v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; |
| 120 | 168 |
| 121 for (x = 0; x < width; x += 8) { | 169 for (x = 0; x < width; x += 8) { |
| 122 src0 = (v16u8)__msa_ld_b((v16i8*)src, 0); | 170 s = src; |
| 123 src1 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride), 0); | 171 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 124 src2 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 2), 0); | 172 s += src_stride; |
| 125 src3 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 3), 0); | 173 src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 126 src4 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 4), 0); | 174 s += src_stride; |
| 127 src5 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 5), 0); | 175 src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 128 src6 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 6), 0); | 176 s += src_stride; |
| 129 src7 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 7), 0); | 177 src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 130 vec0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0); | 178 s += src_stride; |
| 131 vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src2); | 179 ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); |
| 132 vec2 = (v16u8)__msa_ilvr_b((v16i8)src5, (v16i8)src4); | 180 ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); |
| 133 vec3 = (v16u8)__msa_ilvr_b((v16i8)src7, (v16i8)src6); | 181 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 134 vec4 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0); | 182 s += src_stride; |
| 135 vec5 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src2); | 183 src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 136 vec6 = (v16u8)__msa_ilvl_b((v16i8)src5, (v16i8)src4); | 184 s += src_stride; |
| 137 vec7 = (v16u8)__msa_ilvl_b((v16i8)src7, (v16i8)src6); | 185 src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 138 reg0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0); | 186 s += src_stride; |
| 139 reg1 = (v16u8)__msa_ilvr_h((v8i16)vec3, (v8i16)vec2); | 187 src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 140 reg2 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0); | 188 s += src_stride; |
| 141 reg3 = (v16u8)__msa_ilvl_h((v8i16)vec3, (v8i16)vec2); | 189 ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); |
| 142 reg4 = (v16u8)__msa_ilvr_h((v8i16)vec5, (v8i16)vec4); | 190 ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); |
| 143 reg5 = (v16u8)__msa_ilvr_h((v8i16)vec7, (v8i16)vec6); | 191 ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); |
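| // Word-level merge of the rows 0-7 groups, as in TransposeWx16_MSA. |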
| 144 reg6 = (v16u8)__msa_ilvl_h((v8i16)vec5, (v8i16)vec4); | 191 ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); |
| 145 reg7 = (v16u8)__msa_ilvl_h((v8i16)vec7, (v8i16)vec6); | 193 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 146 dst0 = (v16u8)__msa_ilvr_w((v4i32)reg1, (v4i32)reg0); | 194 s += src_stride; |
| 147 dst1 = (v16u8)__msa_ilvl_w((v4i32)reg1, (v4i32)reg0); | 195 src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 148 dst2 = (v16u8)__msa_ilvr_w((v4i32)reg3, (v4i32)reg2); | 196 s += src_stride; |
| 149 dst3 = (v16u8)__msa_ilvl_w((v4i32)reg3, (v4i32)reg2); | 197 src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 150 dst4 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg4); | 198 s += src_stride; |
| 151 dst5 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg4); | 199 src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 152 dst6 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg6); | 200 s += src_stride; |
| 153 dst7 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg6); | 201 ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); |
| 154 val0 = __msa_copy_s_d((v2i64)dst0, 0); | 202 ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); |
| 155 val1 = __msa_copy_s_d((v2i64)dst0, 1); | 203 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 156 val2 = __msa_copy_s_d((v2i64)dst1, 0); | 204 s += src_stride; |
| 157 val3 = __msa_copy_s_d((v2i64)dst1, 1); | 205 src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 158 SD(val0, dst_a); | 206 s += src_stride; |
| 159 SD(val2, dst_a + dst_stride_a); | 207 src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 160 SD(val1, dst_b); | 208 s += src_stride; |
| 161 SD(val3, dst_b + dst_stride_b); | 209 src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
| 162 dst_a += dst_stride_a * 2; | 210 s += src_stride; |
| 163 dst_b += dst_stride_b * 2; | 211 ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); |
| 164 val0 = __msa_copy_s_d((v2i64)dst2, 0); | 212 ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); |
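| // Doubleword merges produce row pairs: even transposed rows (first |
| // byte of each pair) go to dst_a, odd rows to dst_b, two rows per |
| // plane per store group. |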
| 165 val1 = __msa_copy_s_d((v2i64)dst2, 1); | 213 res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); |
| 166 val2 = __msa_copy_s_d((v2i64)dst3, 0); | 214 res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); |
| 167 val3 = __msa_copy_s_d((v2i64)dst3, 1); | 215 ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); |
| 168 SD(val0, dst_a); | 216 ST_UB2(dst0, dst2, dst_a, dst_stride_a); |
| 169 SD(val2, dst_a + dst_stride_a); | 217 ST_UB2(dst1, dst3, dst_b, dst_stride_b); |
| 170 SD(val1, dst_b); | 218 dst_a += dst_stride_a * 2; |
| 171 SD(val3, dst_b + dst_stride_b); | 219 dst_b += dst_stride_b * 2; |
| 172 dst_a += dst_stride_a * 2; | 220 res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); |
| 173 dst_b += dst_stride_b * 2; | 221 res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); |
| 174 val0 = __msa_copy_s_d((v2i64)dst4, 0); | 222 ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); |
| 175 val1 = __msa_copy_s_d((v2i64)dst4, 1); | 223 ST_UB2(dst0, dst2, dst_a, dst_stride_a); |
| 176 val2 = __msa_copy_s_d((v2i64)dst5, 0); | 224 ST_UB2(dst1, dst3, dst_b, dst_stride_b); |
| 177 val3 = __msa_copy_s_d((v2i64)dst5, 1); | 225 dst_a += dst_stride_a * 2; |
| 178 SD(val0, dst_a); | 226 dst_b += dst_stride_b * 2; |
| 179 SD(val2, dst_a + dst_stride_a); | 227 res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); |
| 180 SD(val1, dst_b); | 228 res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); |
| 181 SD(val3, dst_b + dst_stride_b); | 229 ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); |
| 182 dst_a += dst_stride_a * 2; | 230 ST_UB2(dst0, dst2, dst_a, dst_stride_a); |
| 183 dst_b += dst_stride_b * 2; | 231 ST_UB2(dst1, dst3, dst_b, dst_stride_b); |
| 184 val0 = __msa_copy_s_d((v2i64)dst6, 0); | 232 dst_a += dst_stride_a * 2; |
| 185 val1 = __msa_copy_s_d((v2i64)dst6, 1); | 233 dst_b += dst_stride_b * 2; |
| 186 val2 = __msa_copy_s_d((v2i64)dst7, 0); | 234 res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); |
| 187 val3 = __msa_copy_s_d((v2i64)dst7, 1); | 235 res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); |
| 188 SD(val0, dst_a); | 236 ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); |
| 189 SD(val2, dst_a + dst_stride_a); | 237 ST_UB2(dst0, dst2, dst_a, dst_stride_a); |
| 190 SD(val1, dst_b); | 238 ST_UB2(dst1, dst3, dst_b, dst_stride_b); |
| 191 SD(val3, dst_b + dst_stride_b); | |
| 192 dst_a += dst_stride_a * 2; | |
| 193 dst_b += dst_stride_b * 2; | |
| 194 src += 16; | 239 src += 16; |
| 195 } | 240 dst_a += dst_stride_a * 2; |
| 196 } | 241 dst_b += dst_stride_b * 2; |
| 197 | 242 } |
| 243 } |
| 244 |
| 198 #ifdef __cplusplus | 245 #ifdef __cplusplus |
| 199 } // extern "C" | 246 } // extern "C" |
| 200 } // namespace libyuv | 247 } // namespace libyuv |
| 201 #endif | 248 #endif |
| 202 | 249 |
| 203 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) | 250 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) |