Chromium Code Reviews

Side by Side Diff: source/rotate_msa.cc

Issue 2553403002: Add MSA optimized TransposeWx8_MSA and TransposeUVWx8_MSA functions (Closed)
Patch Set: Changes as per review comments (created 4 years ago)
/*
 * Copyright 2016 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"

// This module is for GCC MSA
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
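// (GCC defines __mips_msa when MSA code generation is enabled, e.g. when
// building with -mmsa, so this guard selects the MSA path automatically;
// defining LIBYUV_DISABLE_MSA turns it off.)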
#include "libyuv/macros_msa.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

void TransposeWx8_MSA(const uint8_t* src,
                      int src_stride,
                      uint8_t* dst,
                      int dst_stride,
                      int width) {
  int x;
  uint64_t val0, val1, val2, val3;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

  // Each iteration transposes a 16x8 tile of the source into an 8x16 tile
  // of the destination.
  for (x = 0; x < width; x += 16) {
    // Load 8 rows of 16 bytes each.
    src0 = (v16u8)__msa_ld_b((v16i8*)src, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride), 0);
    src2 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 2), 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 3), 0);
    src4 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 4), 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 5), 0);
    src6 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 6), 0);
    src7 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 7), 0);
    // Interleave bytes of row pairs (right halves, then left halves).
    vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
    vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
    vec2 = (v16u8)__msa_ilvr_b((v16i8)src6, (v16i8)src4);
    vec3 = (v16u8)__msa_ilvr_b((v16i8)src7, (v16i8)src5);
    vec4 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
    vec5 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
    vec6 = (v16u8)__msa_ilvl_b((v16i8)src6, (v16i8)src4);
    vec7 = (v16u8)__msa_ilvl_b((v16i8)src7, (v16i8)src5);
    // Interleave again so 4 consecutive rows of each column are contiguous.
    reg0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
    reg1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0);
    reg2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec2);
    reg3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec2);
    reg4 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4);
    reg5 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4);
    reg6 = (v16u8)__msa_ilvr_b((v16i8)vec7, (v16i8)vec6);
    reg7 = (v16u8)__msa_ilvl_b((v16i8)vec7, (v16i8)vec6);
    // Interleave 32-bit words so each 64-bit lane holds one full column
    // across all 8 rows.
    dst0 = (v16u8)__msa_ilvr_w((v4i32)reg2, (v4i32)reg0);
    dst1 = (v16u8)__msa_ilvl_w((v4i32)reg2, (v4i32)reg0);
    dst2 = (v16u8)__msa_ilvr_w((v4i32)reg3, (v4i32)reg1);
    dst3 = (v16u8)__msa_ilvl_w((v4i32)reg3, (v4i32)reg1);
    dst4 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg4);
    dst5 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg4);
    dst6 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg5);
    dst7 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg5);
    // Copy out 64-bit lanes; each is stored as an 8-byte destination row.
    val0 = __msa_copy_s_d((v2i64)dst0, 0);
    val1 = __msa_copy_s_d((v2i64)dst0, 1);
    val2 = __msa_copy_s_d((v2i64)dst1, 0);
    val3 = __msa_copy_s_d((v2i64)dst1, 1);
    SD(val0, dst);
    SD(val1, dst + dst_stride);
    SD(val2, dst + dst_stride * 2);
    SD(val3, dst + dst_stride * 3);
    dst += dst_stride * 4;
    val0 = __msa_copy_s_d((v2i64)dst2, 0);
    val1 = __msa_copy_s_d((v2i64)dst2, 1);
    val2 = __msa_copy_s_d((v2i64)dst3, 0);
    val3 = __msa_copy_s_d((v2i64)dst3, 1);
    SD(val0, dst);
    SD(val1, dst + dst_stride);
    SD(val2, dst + dst_stride * 2);
    SD(val3, dst + dst_stride * 3);
    dst += dst_stride * 4;
    val0 = __msa_copy_s_d((v2i64)dst4, 0);
    val1 = __msa_copy_s_d((v2i64)dst4, 1);
    val2 = __msa_copy_s_d((v2i64)dst5, 0);
    val3 = __msa_copy_s_d((v2i64)dst5, 1);
    SD(val0, dst);
    SD(val1, dst + dst_stride);
    SD(val2, dst + dst_stride * 2);
    SD(val3, dst + dst_stride * 3);
    dst += dst_stride * 4;
    val0 = __msa_copy_s_d((v2i64)dst6, 0);
    val1 = __msa_copy_s_d((v2i64)dst6, 1);
    val2 = __msa_copy_s_d((v2i64)dst7, 0);
    val3 = __msa_copy_s_d((v2i64)dst7, 1);
    SD(val0, dst);
    SD(val1, dst + dst_stride);
    SD(val2, dst + dst_stride * 2);
    SD(val3, dst + dst_stride * 3);
    dst += dst_stride * 4;
    src += 16;
  }
}

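// For reference, a minimal scalar sketch of what the vector path above
// computes. Illustrative only, not part of this patch; the helper name is
// hypothetical. Each source column becomes an 8-byte destination row.
static void TransposeWx8_Scalar(const uint8_t* src,
                                int src_stride,
                                uint8_t* dst,
                                int dst_stride,
                                int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 8; ++j) {
      // dst row i, column j <- src row j, column i.
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}
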
void TransposeUVWx8_MSA(const uint8_t* src,
                        int src_stride,
                        uint8_t* dst_a,
                        int dst_stride_a,
                        uint8_t* dst_b,
                        int dst_stride_b,
                        int width) {
  int x;
  uint64_t val0, val1, val2, val3;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

  // width is in UV pairs; each iteration consumes 8 pairs (16 bytes) per row.
  for (x = 0; x < width; x += 8) {
    // Load 8 rows of 16 bytes (8 interleaved UV pairs) each.
    src0 = (v16u8)__msa_ld_b((v16i8*)src, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride), 0);
    src2 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 2), 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 3), 0);
    src4 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 4), 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 5), 0);
    src6 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 6), 0);
    src7 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 7), 0);
    // Interleave bytes of adjacent row pairs.
    vec0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src2);
    vec2 = (v16u8)__msa_ilvr_b((v16i8)src5, (v16i8)src4);
    vec3 = (v16u8)__msa_ilvr_b((v16i8)src7, (v16i8)src6);
    vec4 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0);
    vec5 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src2);
    vec6 = (v16u8)__msa_ilvl_b((v16i8)src5, (v16i8)src4);
    vec7 = (v16u8)__msa_ilvl_b((v16i8)src7, (v16i8)src6);
    // Interleave halfwords to gather each U or V sample across 4 rows.
    reg0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0);
    reg1 = (v16u8)__msa_ilvr_h((v8i16)vec3, (v8i16)vec2);
    reg2 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0);
    reg3 = (v16u8)__msa_ilvl_h((v8i16)vec3, (v8i16)vec2);
    reg4 = (v16u8)__msa_ilvr_h((v8i16)vec5, (v8i16)vec4);
    reg5 = (v16u8)__msa_ilvr_h((v8i16)vec7, (v8i16)vec6);
    reg6 = (v16u8)__msa_ilvl_h((v8i16)vec5, (v8i16)vec4);
    reg7 = (v16u8)__msa_ilvl_h((v8i16)vec7, (v8i16)vec6);
    // Interleave words: lane 0 of each result holds a U column across all
    // 8 rows, lane 1 the matching V column.
    dst0 = (v16u8)__msa_ilvr_w((v4i32)reg1, (v4i32)reg0);
    dst1 = (v16u8)__msa_ilvl_w((v4i32)reg1, (v4i32)reg0);
    dst2 = (v16u8)__msa_ilvr_w((v4i32)reg3, (v4i32)reg2);
    dst3 = (v16u8)__msa_ilvl_w((v4i32)reg3, (v4i32)reg2);
    dst4 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg4);
    dst5 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg4);
    dst6 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg6);
    dst7 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg6);
    // Store U columns to dst_a and V columns to dst_b, 8 bytes per row.
    val0 = __msa_copy_s_d((v2i64)dst0, 0);
    val1 = __msa_copy_s_d((v2i64)dst0, 1);
    val2 = __msa_copy_s_d((v2i64)dst1, 0);
    val3 = __msa_copy_s_d((v2i64)dst1, 1);
    SD(val0, dst_a);
    SD(val2, dst_a + dst_stride_a);
    SD(val1, dst_b);
    SD(val3, dst_b + dst_stride_b);
    dst_a += dst_stride_a * 2;
    dst_b += dst_stride_b * 2;
    val0 = __msa_copy_s_d((v2i64)dst2, 0);
    val1 = __msa_copy_s_d((v2i64)dst2, 1);
    val2 = __msa_copy_s_d((v2i64)dst3, 0);
    val3 = __msa_copy_s_d((v2i64)dst3, 1);
    SD(val0, dst_a);
    SD(val2, dst_a + dst_stride_a);
    SD(val1, dst_b);
    SD(val3, dst_b + dst_stride_b);
    dst_a += dst_stride_a * 2;
    dst_b += dst_stride_b * 2;
    val0 = __msa_copy_s_d((v2i64)dst4, 0);
    val1 = __msa_copy_s_d((v2i64)dst4, 1);
    val2 = __msa_copy_s_d((v2i64)dst5, 0);
    val3 = __msa_copy_s_d((v2i64)dst5, 1);
    SD(val0, dst_a);
    SD(val2, dst_a + dst_stride_a);
    SD(val1, dst_b);
    SD(val3, dst_b + dst_stride_b);
    dst_a += dst_stride_a * 2;
    dst_b += dst_stride_b * 2;
    val0 = __msa_copy_s_d((v2i64)dst6, 0);
    val1 = __msa_copy_s_d((v2i64)dst6, 1);
    val2 = __msa_copy_s_d((v2i64)dst7, 0);
    val3 = __msa_copy_s_d((v2i64)dst7, 1);
    SD(val0, dst_a);
    SD(val2, dst_a + dst_stride_a);
    SD(val1, dst_b);
    SD(val3, dst_b + dst_stride_b);
    dst_a += dst_stride_a * 2;
    dst_b += dst_stride_b * 2;
    src += 16;
  }
}

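// Likewise, a minimal scalar sketch of the UV variant (illustrative only,
// not part of this patch; the helper name is hypothetical). width counts
// UV pairs; the interleaved U and V bytes are split into dst_a and dst_b
// while transposing.
static void TransposeUVWx8_Scalar(const uint8_t* src,
                                  int src_stride,
                                  uint8_t* dst_a,
                                  int dst_stride_a,
                                  uint8_t* dst_b,
                                  int dst_stride_b,
                                  int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 8; ++j) {
      dst_a[i * dst_stride_a + j] = src[j * src_stride + i * 2];      // U
      dst_b[i * dst_stride_b + j] = src[j * src_stride + i * 2 + 1];  // V
    }
  }
}
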
#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)