/*
 * Copyright 2016 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"

// This module is for GCC MSA
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include "libyuv/macros_msa.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

void TransposeWx8_MSA(const uint8_t* src,
                      int src_stride,
                      uint8_t* dst,
                      int dst_stride,
                      int width) {
  int x;
  uint64_t val0, val1, val2, val3;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

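  // Each iteration transposes a 16x8 block: 16 bytes are loaded from each of
  // 8 source rows and written back as 16 destination rows of 8 bytes each.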
34 for (x = 0; x < width; x += 16) { | |
35 src0 = (v16u8)__msa_ld_b((v16i8*)src, 0); | |
36 src1 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride), 0); | |
37 src2 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 2), 0); | |
38 src3 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 3), 0); | |
39 src4 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 4), 0); | |
40 src5 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 5), 0); | |
41 src6 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 6), 0); | |
42 src7 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 7), 0); | |
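    // Interleave bytes of row pairs (0,2), (1,3), (4,6) and (5,7): low halves
    // into vec0..vec3, high halves into vec4..vec7.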
    vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
    vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
    vec2 = (v16u8)__msa_ilvr_b((v16i8)src6, (v16i8)src4);
    vec3 = (v16u8)__msa_ilvr_b((v16i8)src7, (v16i8)src5);
    vec4 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
    vec5 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
    vec6 = (v16u8)__msa_ilvl_b((v16i8)src6, (v16i8)src4);
    vec7 = (v16u8)__msa_ilvl_b((v16i8)src7, (v16i8)src5);
    reg0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
    reg1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0);
    reg2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec2);
    reg3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec2);
    reg4 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4);
    reg5 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4);
    reg6 = (v16u8)__msa_ilvr_b((v16i8)vec7, (v16i8)vec6);
    reg7 = (v16u8)__msa_ilvl_b((v16i8)vec7, (v16i8)vec6);
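    // Interleave words so that each 64-bit lane of dst0..dst7 holds one
    // transposed column (one byte from each of the 8 source rows).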
    dst0 = (v16u8)__msa_ilvr_w((v4i32)reg2, (v4i32)reg0);
    dst1 = (v16u8)__msa_ilvl_w((v4i32)reg2, (v4i32)reg0);
    dst2 = (v16u8)__msa_ilvr_w((v4i32)reg3, (v4i32)reg1);
    dst3 = (v16u8)__msa_ilvl_w((v4i32)reg3, (v4i32)reg1);
    dst4 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg4);
    dst5 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg4);
    dst6 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg5);
    dst7 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg5);
    val0 = __msa_copy_s_d((v2i64)dst0, 0);
fbarchard1
2016/12/12 19:59:19
consider if this can be weaved back into simd registers

manojkumar.bhosale
2016/12/13 08:52:23
As each double word store is offset with dst_stride ...

fbarchard1
2016/12/13 18:32:03
Acknowledged.
msa can't store 8 bytes directly from ...
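    // Note: SD() is assumed to be the 64-bit scalar store helper from
    // macros_msa.h. Each 16-byte result vector is split into its two 8-byte
    // lanes with __msa_copy_s_d and written to stride-separated destination
    // rows.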
    val1 = __msa_copy_s_d((v2i64)dst0, 1);
    val2 = __msa_copy_s_d((v2i64)dst1, 0);
    val3 = __msa_copy_s_d((v2i64)dst1, 1);
    SD(val0, dst);
    SD(val1, dst + dst_stride);
    SD(val2, dst + dst_stride * 2);
    SD(val3, dst + dst_stride * 3);
    dst += dst_stride * 4;
    val0 = __msa_copy_s_d((v2i64)dst2, 0);
    val1 = __msa_copy_s_d((v2i64)dst2, 1);
    val2 = __msa_copy_s_d((v2i64)dst3, 0);
    val3 = __msa_copy_s_d((v2i64)dst3, 1);
    SD(val0, dst);
    SD(val1, dst + dst_stride);
    SD(val2, dst + dst_stride * 2);
    SD(val3, dst + dst_stride * 3);
    dst += dst_stride * 4;
    val0 = __msa_copy_s_d((v2i64)dst4, 0);
    val1 = __msa_copy_s_d((v2i64)dst4, 1);
    val2 = __msa_copy_s_d((v2i64)dst5, 0);
    val3 = __msa_copy_s_d((v2i64)dst5, 1);
    SD(val0, dst);
    SD(val1, dst + dst_stride);
    SD(val2, dst + dst_stride * 2);
    SD(val3, dst + dst_stride * 3);
    dst += dst_stride * 4;
    val0 = __msa_copy_s_d((v2i64)dst6, 0);
    val1 = __msa_copy_s_d((v2i64)dst6, 1);
    val2 = __msa_copy_s_d((v2i64)dst7, 0);
    val3 = __msa_copy_s_d((v2i64)dst7, 1);
    SD(val0, dst);
    SD(val1, dst + dst_stride);
    SD(val2, dst + dst_stride * 2);
    SD(val3, dst + dst_stride * 3);
    dst += dst_stride * 4;
    src += 16;
  }
}
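
// A minimal usage sketch (not part of the patch; the helper name is
// illustrative): a caller along the lines of libyuv's TransposePlane would
// drive this kernel over the image in bands of 8 rows, e.g.:
//
//   static void TransposeBands(const uint8_t* src, int src_stride,
//                              uint8_t* dst, int dst_stride,
//                              int width, int height) {
//     int i;
//     for (i = 0; i < height - 7; i += 8) {
//       TransposeWx8_MSA(src, src_stride, dst, dst_stride, width);
//       src += 8 * src_stride;  // next band of 8 source rows
//       dst += 8;               // transposed band occupies the next 8 columns
//     }
//     // Any remaining (height & 7) rows would be handled by a scalar path.
//   }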

void TransposeUVWx8_MSA(const uint8_t* src,
                        int src_stride,
                        uint8_t* dst_a,
                        int dst_stride_a,
                        uint8_t* dst_b,
                        int dst_stride_b,
                        int width) {
  int x;
  uint64_t val0, val1, val2, val3;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

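  // Each iteration transposes 8 interleaved UV pairs (16 bytes) from 8 source
  // rows, producing 8 rows of 8 U bytes in dst_a and 8 rows of 8 V bytes in
  // dst_b.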
  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride), 0);
    src2 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 2), 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 3), 0);
    src4 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 4), 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 5), 0);
    src6 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 6), 0);
    src7 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 7), 0);
    vec0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src2);
    vec2 = (v16u8)__msa_ilvr_b((v16i8)src5, (v16i8)src4);
    vec3 = (v16u8)__msa_ilvr_b((v16i8)src7, (v16i8)src6);
    vec4 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0);
    vec5 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src2);
    vec6 = (v16u8)__msa_ilvl_b((v16i8)src5, (v16i8)src4);
    vec7 = (v16u8)__msa_ilvl_b((v16i8)src7, (v16i8)src6);
    reg0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0);
    reg1 = (v16u8)__msa_ilvr_h((v8i16)vec3, (v8i16)vec2);
    reg2 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0);
    reg3 = (v16u8)__msa_ilvl_h((v8i16)vec3, (v8i16)vec2);
    reg4 = (v16u8)__msa_ilvr_h((v8i16)vec5, (v8i16)vec4);
    reg5 = (v16u8)__msa_ilvr_h((v8i16)vec7, (v8i16)vec6);
    reg6 = (v16u8)__msa_ilvl_h((v8i16)vec5, (v8i16)vec4);
    reg7 = (v16u8)__msa_ilvl_h((v8i16)vec7, (v8i16)vec6);
    dst0 = (v16u8)__msa_ilvr_w((v4i32)reg1, (v4i32)reg0);
    dst1 = (v16u8)__msa_ilvl_w((v4i32)reg1, (v4i32)reg0);
    dst2 = (v16u8)__msa_ilvr_w((v4i32)reg3, (v4i32)reg2);
    dst3 = (v16u8)__msa_ilvl_w((v4i32)reg3, (v4i32)reg2);
    dst4 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg4);
    dst5 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg4);
    dst6 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg6);
    dst7 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg6);
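    // The low 8-byte lane of each dstN holds a transposed U column and the
    // high lane the matching V column, so lane 0 goes to dst_a and lane 1 to
    // dst_b.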
    val0 = __msa_copy_s_d((v2i64)dst0, 0);
    val1 = __msa_copy_s_d((v2i64)dst0, 1);
    val2 = __msa_copy_s_d((v2i64)dst1, 0);
    val3 = __msa_copy_s_d((v2i64)dst1, 1);
    SD(val0, dst_a);
    SD(val2, dst_a + dst_stride_a);
    SD(val1, dst_b);
    SD(val3, dst_b + dst_stride_b);
    dst_a += dst_stride_a * 2;
    dst_b += dst_stride_b * 2;
    val0 = __msa_copy_s_d((v2i64)dst2, 0);
    val1 = __msa_copy_s_d((v2i64)dst2, 1);
    val2 = __msa_copy_s_d((v2i64)dst3, 0);
    val3 = __msa_copy_s_d((v2i64)dst3, 1);
    SD(val0, dst_a);
    SD(val2, dst_a + dst_stride_a);
    SD(val1, dst_b);
    SD(val3, dst_b + dst_stride_b);
    dst_a += dst_stride_a * 2;
    dst_b += dst_stride_b * 2;
    val0 = __msa_copy_s_d((v2i64)dst4, 0);
    val1 = __msa_copy_s_d((v2i64)dst4, 1);
    val2 = __msa_copy_s_d((v2i64)dst5, 0);
    val3 = __msa_copy_s_d((v2i64)dst5, 1);
    SD(val0, dst_a);
    SD(val2, dst_a + dst_stride_a);
    SD(val1, dst_b);
    SD(val3, dst_b + dst_stride_b);
    dst_a += dst_stride_a * 2;
    dst_b += dst_stride_b * 2;
    val0 = __msa_copy_s_d((v2i64)dst6, 0);
    val1 = __msa_copy_s_d((v2i64)dst6, 1);
    val2 = __msa_copy_s_d((v2i64)dst7, 0);
    val3 = __msa_copy_s_d((v2i64)dst7, 1);
    SD(val0, dst_a);
    SD(val2, dst_a + dst_stride_a);
    SD(val1, dst_b);
    SD(val3, dst_b + dst_stride_b);
    dst_a += dst_stride_a * 2;
    dst_b += dst_stride_b * 2;
    src += 16;
  }
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)