OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include "libyuv/rotate_row.h" | 11 #include "libyuv/rotate_row.h" |
12 | 12 |
13 // This module is for GCC MSA | 13 // This module is for GCC MSA |
14 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) | 14 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) |
15 #include "libyuv/macros_msa.h" | 15 #include "libyuv/macros_msa.h" |
16 | 16 |
17 #ifdef __cplusplus | 17 #ifdef __cplusplus |
18 namespace libyuv { | 18 namespace libyuv { |
19 extern "C" { | 19 extern "C" { |
20 #endif | 20 #endif |
21 | 21 |
22 void TransposeWx8_MSA(const uint8_t* src, | 22 #define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \ |
23 int src_stride, | 23 { \ |
24 uint8_t* dst, | 24 out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0); \ |
25 int dst_stride, | 25 out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0); \ |
26 int width) { | 26 out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2); \ |
| 27 out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2); \ |
| 28 } |
| 29 |
| 30 #define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \ |
| 31 { \ |
| 32 out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0); \ |
| 33 out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0); \ |
| 34 out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2); \ |
| 35 out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2); \ |
| 36 } |
| 37 |
| 38 #define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \ |
| 39 { \ |
| 40 out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0); \ |
| 41 out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0); \ |
| 42 out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2); \ |
| 43 out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2); \ |
| 44 } |
| 45 |
| 46 #define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \ |
| 47 { \ |
| 48 out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0); \ |
| 49 out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0); \ |
| 50 out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2); \ |
| 51 out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ |
| 52 } |
| 53 |
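The four ILVRL_* helpers above each pair an ilvr (interleave right: the low halves of the two source vectors) with the matching ilvl (interleave left: the high halves). As a reading aid, here is a scalar model of the byte variant; the *_model helpers are illustrative only, not part of the patch, and assume MSA's little-endian lane order in which wt supplies the even output lanes:

    /* Sketch of __msa_ilvr_b(ws, wt): interleave the low 8 bytes of
     * the two sources; wt fills the even output lanes, ws the odd. */
    static void ilvr_b_model(const uint8_t ws[16], const uint8_t wt[16],
                             uint8_t wd[16]) {
      int i;
      for (i = 0; i < 8; ++i) {
        wd[2 * i] = wt[i];
        wd[2 * i + 1] = ws[i];
      }
    }

    /* Sketch of __msa_ilvl_b(ws, wt): the same interleave applied to
     * the high 8 bytes of each source. */
    static void ilvl_b_model(const uint8_t ws[16], const uint8_t wt[16],
                             uint8_t wd[16]) {
      int i;
      for (i = 0; i < 8; ++i) {
        wd[2 * i] = wt[i + 8];
        wd[2 * i + 1] = ws[i + 8];
      }
    }

ILVRL_H, ILVRL_W and ILVRL_D do the same on 16-, 32- and 64-bit lanes; chaining B, H, W, D doubles the run of bytes kept together at each stage, the usual shuffle-network construction of a 16x16 byte transpose.
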
| 54 void TransposeWx16_C(const uint8_t* src, |
| 55 int src_stride, |
| 56 uint8_t* dst, |
| 57 int dst_stride, |
| 58 int width) { |
| 59 TransposeWx8_C(src, src_stride, dst, dst_stride, width); |
| 60 TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, |
| 61 width); |
| 62 } |
| 63 |
| 64 void TransposeUVWx16_C(const uint8_t* src, |
| 65 int src_stride, |
| 66 uint8_t* dst_a, |
| 67 int dst_stride_a, |
| 68 uint8_t* dst_b, |
| 69 int dst_stride_b, |
| 70 int width) { |
| 71 TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, |
| 72 width); |
| 73 TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8), |
| 74 dst_stride_a, (dst_b + 8), dst_stride_b, width); |
| 75 } |
| 76 |
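The Wx16 C fallbacks run the existing Wx8 kernels twice: source rows 8..15 become destination columns 8..15, hence the paired src + 8 * src_stride and dst + 8 offsets. A scalar reference for the full tile (an illustrative helper, not part of the patch or of libyuv) makes the index mapping explicit:

    /* Reference transpose of a width x 16 tile: the byte at row y,
     * column x of src lands at row x, column y of dst, so a second
     * 8-row pass starts 8 bytes further into each dst row. */
    static void TransposeWx16_Reference(const uint8_t* src, int src_stride,
                                        uint8_t* dst, int dst_stride,
                                        int width) {
      int x, y;
      for (y = 0; y < 16; ++y) {
        for (x = 0; x < width; ++x) {
          dst[x * dst_stride + y] = src[y * src_stride + x];
        }
      }
    }
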
| 77 void TransposeWx16_MSA(const uint8_t* src, |
| 78 int src_stride, |
| 79 uint8_t* dst, |
| 80 int dst_stride, |
| 81 int width) { |
27 int x; | 82 int x; |
28 uint64_t val0, val1, val2, val3; | 83 const uint8_t* s; |
29 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; | 84 v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; |
30 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | |
31 v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; | 85 v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; |
32 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; | 86 v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; |
33 | 87 |
34 for (x = 0; x < width; x += 16) { | 88 for (x = 0; x < width; x += 16) { |
35 src0 = (v16u8)__msa_ld_b((v16i8*)src, 0); | 89 s = src; |
36 src1 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride), 0); | 90 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
37 src2 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 2), 0); | 91 s += src_stride; |
38 src3 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 3), 0); | 92 src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
39 src4 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 4), 0); | 93 s += src_stride; |
40 src5 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 5), 0); | 94 src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
41 src6 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 6), 0); | 95 s += src_stride; |
42 src7 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 7), 0); | 96 src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
43 vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); | 97 s += src_stride; |
44 vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); | 98 ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); |
45 vec2 = (v16u8)__msa_ilvr_b((v16i8)src6, (v16i8)src4); | 99 ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); |
46 vec3 = (v16u8)__msa_ilvr_b((v16i8)src7, (v16i8)src5); | 100 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
47 vec4 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); | 101 s += src_stride; |
48 vec5 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); | 102 src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
49 vec6 = (v16u8)__msa_ilvl_b((v16i8)src6, (v16i8)src4); | 103 s += src_stride; |
50 vec7 = (v16u8)__msa_ilvl_b((v16i8)src7, (v16i8)src5); | 104 src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
51 reg0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0); | 105 s += src_stride; |
52 reg1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0); | 106 src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
53 reg2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec2); | 107 s += src_stride; |
54 reg3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec2); | 108 ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); |
55 reg4 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4); | 109 ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); |
56 reg5 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4); | 110 ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); |
57 reg6 = (v16u8)__msa_ilvr_b((v16i8)vec7, (v16i8)vec6); | 111 ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); |
58 reg7 = (v16u8)__msa_ilvl_b((v16i8)vec7, (v16i8)vec6); | 112 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
59 dst0 = (v16u8)__msa_ilvr_w((v4i32)reg2, (v4i32)reg0); | 113 s += src_stride; |
60 dst1 = (v16u8)__msa_ilvl_w((v4i32)reg2, (v4i32)reg0); | 114 src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
61 dst2 = (v16u8)__msa_ilvr_w((v4i32)reg3, (v4i32)reg1); | 115 s += src_stride; |
62 dst3 = (v16u8)__msa_ilvl_w((v4i32)reg3, (v4i32)reg1); | 116 src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
63 dst4 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg4); | 117 s += src_stride; |
64 dst5 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg4); | 118 src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
65 dst6 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg5); | 119 s += src_stride; |
66 dst7 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg5); | 120 ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); |
67 val0 = __msa_copy_s_d((v2i64)dst0, 0); | 121 ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); |
68 val1 = __msa_copy_s_d((v2i64)dst0, 1); | 122 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
69 val2 = __msa_copy_s_d((v2i64)dst1, 0); | 123 s += src_stride; |
70 val3 = __msa_copy_s_d((v2i64)dst1, 1); | 124 src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
71 SD(val0, dst); | 125 s += src_stride; |
72 SD(val1, dst + dst_stride); | 126 src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
73 SD(val2, dst + dst_stride * 2); | 127 s += src_stride; |
74 SD(val3, dst + dst_stride * 3); | 128 src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
75 dst += dst_stride * 4; | 129 s += src_stride; |
76 val0 = __msa_copy_s_d((v2i64)dst2, 0); | 130 ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); |
77 val1 = __msa_copy_s_d((v2i64)dst2, 1); | 131 ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); |
78 val2 = __msa_copy_s_d((v2i64)dst3, 0); | 132 res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); |
79 val3 = __msa_copy_s_d((v2i64)dst3, 1); | 133 res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); |
80 SD(val0, dst); | 134 ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); |
81 SD(val1, dst + dst_stride); | 135 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); |
82 SD(val2, dst + dst_stride * 2); | 136 dst += dst_stride * 4; |
83 SD(val3, dst + dst_stride * 3); | 137 res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); |
84 dst += dst_stride * 4; | 138 res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); |
85 val0 = __msa_copy_s_d((v2i64)dst4, 0); | 139 ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); |
86 val1 = __msa_copy_s_d((v2i64)dst4, 1); | 140 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); |
87 val2 = __msa_copy_s_d((v2i64)dst5, 0); | 141 dst += dst_stride * 4; |
88 val3 = __msa_copy_s_d((v2i64)dst5, 1); | 142 res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); |
89 SD(val0, dst); | 143 res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); |
90 SD(val1, dst + dst_stride); | 144 ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); |
91 SD(val2, dst + dst_stride * 2); | 145 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); |
92 SD(val3, dst + dst_stride * 3); | 146 dst += dst_stride * 4; |
93 dst += dst_stride * 4; | 147 res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); |
94 val0 = __msa_copy_s_d((v2i64)dst6, 0); | 148 res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); |
95 val1 = __msa_copy_s_d((v2i64)dst6, 1); | 149 ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); |
96 val2 = __msa_copy_s_d((v2i64)dst7, 0); | 150 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); |
97 val3 = __msa_copy_s_d((v2i64)dst7, 1); | |
98 SD(val0, dst); | |
99 SD(val1, dst + dst_stride); | |
100 SD(val2, dst + dst_stride * 2); | |
101 SD(val3, dst + dst_stride * 3); | |
102 dst += dst_stride * 4; | |
103 src += 16; | 151 src += 16; |
104 } | 152 dst += dst_stride * 4; |
105 } | 153 } |
106 | 154 } |
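
TransposeWx16_MSA loads the sixteen rows in four groups of four, runs each group through ILVRL_B and ILVRL_H, merges the groups with word interleaves, and finishes with ILVRL_D, whose four outputs are complete 16-byte destination rows stored by ST_UB4, the strided four-vector store from macros_msa.h. It also replaces src + src_stride * n addressing with a walking s pointer, one add per load. A self-check along these lines could be run on MSA hardware against the scalar reference sketched earlier (assumes <string.h> is available; the buffer sizes are illustrative):

    static int CheckTransposeWx16(void) {
      enum { kWidth = 32 }; /* any multiple of 16, per the loop step */
      uint8_t src[16 * kWidth];
      uint8_t got[kWidth * 16];
      uint8_t want[kWidth * 16];
      int i;
      for (i = 0; i < 16 * kWidth; ++i) {
        src[i] = (uint8_t)(i * 7 + 1); /* arbitrary non-trivial pattern */
      }
      TransposeWx16_MSA(src, kWidth, got, 16, kWidth);
      TransposeWx16_Reference(src, kWidth, want, 16, kWidth);
      return memcmp(got, want, sizeof(got)) == 0; /* 1 on success */
    }
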
107 void TransposeUVWx8_MSA(const uint8_t* src, | 155 |
108 int src_stride, | 156 void TransposeUVWx16_MSA(const uint8_t* src, |
109 uint8_t* dst_a, | 157 int src_stride, |
110 int dst_stride_a, | 158 uint8_t* dst_a, |
111 uint8_t* dst_b, | 159 int dst_stride_a, |
112 int dst_stride_b, | 160 uint8_t* dst_b, |
113 int width) { | 161 int dst_stride_b, |
| 162 int width) { |
114 int x; | 163 int x; |
115 uint64_t val0, val1, val2, val3; | 164 const uint8_t* s; |
116 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; | 165 v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; |
117 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; | |
118 v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; | 166 v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; |
119 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; | 167 v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; |
120 | 168 |
121 for (x = 0; x < width; x += 8) { | 169 for (x = 0; x < width; x += 8) { |
122 src0 = (v16u8)__msa_ld_b((v16i8*)src, 0); | 170 s = src; |
123 src1 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride), 0); | 171 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
124 src2 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 2), 0); | 172 s += src_stride; |
125 src3 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 3), 0); | 173 src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
126 src4 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 4), 0); | 174 s += src_stride; |
127 src5 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 5), 0); | 175 src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
128 src6 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 6), 0); | 176 s += src_stride; |
129 src7 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 7), 0); | 177 src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
130 vec0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0); | 178 s += src_stride; |
131 vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src2); | 179 ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); |
132 vec2 = (v16u8)__msa_ilvr_b((v16i8)src5, (v16i8)src4); | 180 ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); |
133 vec3 = (v16u8)__msa_ilvr_b((v16i8)src7, (v16i8)src6); | 181 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
134 vec4 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0); | 182 s += src_stride; |
135 vec5 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src2); | 183 src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
136 vec6 = (v16u8)__msa_ilvl_b((v16i8)src5, (v16i8)src4); | 184 s += src_stride; |
137 vec7 = (v16u8)__msa_ilvl_b((v16i8)src7, (v16i8)src6); | 185 src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
138 reg0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0); | 186 s += src_stride; |
139 reg1 = (v16u8)__msa_ilvr_h((v8i16)vec3, (v8i16)vec2); | 187 src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
140 reg2 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0); | 188 s += src_stride; |
141 reg3 = (v16u8)__msa_ilvl_h((v8i16)vec3, (v8i16)vec2); | 189 ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); |
142 reg4 = (v16u8)__msa_ilvr_h((v8i16)vec5, (v8i16)vec4); | 190 ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); |
143 reg5 = (v16u8)__msa_ilvr_h((v8i16)vec7, (v8i16)vec6); | 191 ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); |
144 reg6 = (v16u8)__msa_ilvl_h((v8i16)vec5, (v8i16)vec4); | 192 ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); |
145 reg7 = (v16u8)__msa_ilvl_h((v8i16)vec7, (v8i16)vec6); | 193 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
146 dst0 = (v16u8)__msa_ilvr_w((v4i32)reg1, (v4i32)reg0); | 194 s += src_stride; |
147 dst1 = (v16u8)__msa_ilvl_w((v4i32)reg1, (v4i32)reg0); | 195 src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
148 dst2 = (v16u8)__msa_ilvr_w((v4i32)reg3, (v4i32)reg2); | 196 s += src_stride; |
149 dst3 = (v16u8)__msa_ilvl_w((v4i32)reg3, (v4i32)reg2); | 197 src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
150 dst4 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg4); | 198 s += src_stride; |
151 dst5 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg4); | 199 src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
152 dst6 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg6); | 200 s += src_stride; |
153 dst7 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg6); | 201 ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); |
154 val0 = __msa_copy_s_d((v2i64)dst0, 0); | 202 ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); |
155 val1 = __msa_copy_s_d((v2i64)dst0, 1); | 203 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
156 val2 = __msa_copy_s_d((v2i64)dst1, 0); | 204 s += src_stride; |
157 val3 = __msa_copy_s_d((v2i64)dst1, 1); | 205 src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
158 SD(val0, dst_a); | 206 s += src_stride; |
159 SD(val2, dst_a + dst_stride_a); | 207 src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
160 SD(val1, dst_b); | 208 s += src_stride; |
161 SD(val3, dst_b + dst_stride_b); | 209 src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); |
162 dst_a += dst_stride_a * 2; | 210 s += src_stride; |
163 dst_b += dst_stride_b * 2; | 211 ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); |
164 val0 = __msa_copy_s_d((v2i64)dst2, 0); | 212 ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); |
165 val1 = __msa_copy_s_d((v2i64)dst2, 1); | 213 res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); |
166 val2 = __msa_copy_s_d((v2i64)dst3, 0); | 214 res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); |
167 val3 = __msa_copy_s_d((v2i64)dst3, 1); | 215 ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); |
168 SD(val0, dst_a); | 216 ST_UB2(dst0, dst2, dst_a, dst_stride_a); |
169 SD(val2, dst_a + dst_stride_a); | 217 ST_UB2(dst1, dst3, dst_b, dst_stride_b); |
170 SD(val1, dst_b); | 218 dst_a += dst_stride_a * 2; |
171 SD(val3, dst_b + dst_stride_b); | 219 dst_b += dst_stride_b * 2; |
172 dst_a += dst_stride_a * 2; | 220 res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); |
173 dst_b += dst_stride_b * 2; | 221 res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); |
174 val0 = __msa_copy_s_d((v2i64)dst4, 0); | 222 ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); |
175 val1 = __msa_copy_s_d((v2i64)dst4, 1); | 223 ST_UB2(dst0, dst2, dst_a, dst_stride_a); |
176 val2 = __msa_copy_s_d((v2i64)dst5, 0); | 224 ST_UB2(dst1, dst3, dst_b, dst_stride_b); |
177 val3 = __msa_copy_s_d((v2i64)dst5, 1); | 225 dst_a += dst_stride_a * 2; |
178 SD(val0, dst_a); | 226 dst_b += dst_stride_b * 2; |
179 SD(val2, dst_a + dst_stride_a); | 227 res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); |
180 SD(val1, dst_b); | 228 res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); |
181 SD(val3, dst_b + dst_stride_b); | 229 ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); |
182 dst_a += dst_stride_a * 2; | 230 ST_UB2(dst0, dst2, dst_a, dst_stride_a); |
183 dst_b += dst_stride_b * 2; | 231 ST_UB2(dst1, dst3, dst_b, dst_stride_b); |
184 val0 = __msa_copy_s_d((v2i64)dst6, 0); | 232 dst_a += dst_stride_a * 2; |
185 val1 = __msa_copy_s_d((v2i64)dst6, 1); | 233 dst_b += dst_stride_b * 2; |
186 val2 = __msa_copy_s_d((v2i64)dst7, 0); | 234 res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); |
187 val3 = __msa_copy_s_d((v2i64)dst7, 1); | 235 res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); |
188 SD(val0, dst_a); | 236 ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); |
189 SD(val2, dst_a + dst_stride_a); | 237 ST_UB2(dst0, dst2, dst_a, dst_stride_a); |
190 SD(val1, dst_b); | 238 ST_UB2(dst1, dst3, dst_b, dst_stride_b); |
191 SD(val3, dst_b + dst_stride_b); | |
192 dst_a += dst_stride_a * 2; | |
193 dst_b += dst_stride_b * 2; | |
194 src += 16; | 239 src += 16; |
195 } | 240 dst_a += dst_stride_a * 2; |
196 } | 241 dst_b += dst_stride_b * 2; |
197 | 242 } |
| 243 } |
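
TransposeUVWx16_MSA applies the same network to interleaved UV data, eight UV pairs (16 bytes) per load, which is why this loop steps by 8. After the final ILVRL_D the two planes sit in alternating vectors, so ST_UB2, the strided two-vector store from macros_msa.h, routes dst0/dst2 to dst_a and dst1/dst3 to dst_b. In scalar terms (again an illustrative helper, not part of the patch), the byte pair at row y, pair index x of src goes to dst_a[x][y] and dst_b[x][y]:

    static void TransposeUVWx16_Reference(const uint8_t* src, int src_stride,
                                          uint8_t* dst_a, int dst_stride_a,
                                          uint8_t* dst_b, int dst_stride_b,
                                          int width) {
      int x, y;
      for (y = 0; y < 16; ++y) {
        for (x = 0; x < width; ++x) {
          dst_a[x * dst_stride_a + y] = src[y * src_stride + 2 * x];
          dst_b[x * dst_stride_b + y] = src[y * src_stride + 2 * x + 1];
        }
      }
    }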
| 244 |
198 #ifdef __cplusplus | 245 #ifdef __cplusplus |
199 } // extern "C" | 246 } // extern "C" |
200 } // namespace libyuv | 247 } // namespace libyuv |
201 #endif | 248 #endif |
202 | 249 |
203 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) | 250 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) |