Chromium Code Reviews

Unified Diff: source/rotate_msa.cc

Issue 2617703002: Add MSA optimized rotate functions (used 16x16 transpose) (Closed)
Patch Set: "correct file mode" (created 3 years, 11 months ago)
--- a/source/rotate_msa.cc
+++ b/source/rotate_msa.cc
@@ -1,203 +1,250 @@
 /*
  *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 #include "libyuv/rotate_row.h"
 
 // This module is for GCC MSA
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
 #include "libyuv/macros_msa.h"
 
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
-void TransposeWx8_MSA(const uint8_t* src,
-                      int src_stride,
-                      uint8_t* dst,
-                      int dst_stride,
-                      int width) {
-  int x;
-  uint64_t val0, val1, val2, val3;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-
-  for (x = 0; x < width; x += 16) {
-    src0 = (v16u8)__msa_ld_b((v16i8*)src, 0);
-    src1 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride), 0);
-    src2 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 2), 0);
-    src3 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 3), 0);
-    src4 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 4), 0);
-    src5 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 5), 0);
-    src6 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 6), 0);
-    src7 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 7), 0);
-    vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
-    vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
-    vec2 = (v16u8)__msa_ilvr_b((v16i8)src6, (v16i8)src4);
-    vec3 = (v16u8)__msa_ilvr_b((v16i8)src7, (v16i8)src5);
-    vec4 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
-    vec5 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
-    vec6 = (v16u8)__msa_ilvl_b((v16i8)src6, (v16i8)src4);
-    vec7 = (v16u8)__msa_ilvl_b((v16i8)src7, (v16i8)src5);
-    reg0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
-    reg1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0);
-    reg2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec2);
-    reg3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec2);
-    reg4 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4);
-    reg5 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4);
-    reg6 = (v16u8)__msa_ilvr_b((v16i8)vec7, (v16i8)vec6);
-    reg7 = (v16u8)__msa_ilvl_b((v16i8)vec7, (v16i8)vec6);
-    dst0 = (v16u8)__msa_ilvr_w((v4i32)reg2, (v4i32)reg0);
-    dst1 = (v16u8)__msa_ilvl_w((v4i32)reg2, (v4i32)reg0);
-    dst2 = (v16u8)__msa_ilvr_w((v4i32)reg3, (v4i32)reg1);
-    dst3 = (v16u8)__msa_ilvl_w((v4i32)reg3, (v4i32)reg1);
-    dst4 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg4);
-    dst5 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg4);
-    dst6 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg5);
-    dst7 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg5);
-    val0 = __msa_copy_s_d((v2i64)dst0, 0);
-    val1 = __msa_copy_s_d((v2i64)dst0, 1);
-    val2 = __msa_copy_s_d((v2i64)dst1, 0);
-    val3 = __msa_copy_s_d((v2i64)dst1, 1);
-    SD(val0, dst);
-    SD(val1, dst + dst_stride);
-    SD(val2, dst + dst_stride * 2);
-    SD(val3, dst + dst_stride * 3);
-    dst += dst_stride * 4;
-    val0 = __msa_copy_s_d((v2i64)dst2, 0);
-    val1 = __msa_copy_s_d((v2i64)dst2, 1);
-    val2 = __msa_copy_s_d((v2i64)dst3, 0);
-    val3 = __msa_copy_s_d((v2i64)dst3, 1);
-    SD(val0, dst);
-    SD(val1, dst + dst_stride);
-    SD(val2, dst + dst_stride * 2);
-    SD(val3, dst + dst_stride * 3);
-    dst += dst_stride * 4;
-    val0 = __msa_copy_s_d((v2i64)dst4, 0);
-    val1 = __msa_copy_s_d((v2i64)dst4, 1);
-    val2 = __msa_copy_s_d((v2i64)dst5, 0);
-    val3 = __msa_copy_s_d((v2i64)dst5, 1);
-    SD(val0, dst);
-    SD(val1, dst + dst_stride);
-    SD(val2, dst + dst_stride * 2);
-    SD(val3, dst + dst_stride * 3);
-    dst += dst_stride * 4;
-    val0 = __msa_copy_s_d((v2i64)dst6, 0);
-    val1 = __msa_copy_s_d((v2i64)dst6, 1);
-    val2 = __msa_copy_s_d((v2i64)dst7, 0);
-    val3 = __msa_copy_s_d((v2i64)dst7, 1);
-    SD(val0, dst);
-    SD(val1, dst + dst_stride);
-    SD(val2, dst + dst_stride * 2);
-    SD(val3, dst + dst_stride * 3);
-    dst += dst_stride * 4;
-    src += 16;
-  }
-}
-
-void TransposeUVWx8_MSA(const uint8_t* src,
-                        int src_stride,
-                        uint8_t* dst_a,
-                        int dst_stride_a,
-                        uint8_t* dst_b,
-                        int dst_stride_b,
-                        int width) {
-  int x;
-  uint64_t val0, val1, val2, val3;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-
-  for (x = 0; x < width; x += 8) {
-    src0 = (v16u8)__msa_ld_b((v16i8*)src, 0);
-    src1 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride), 0);
-    src2 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 2), 0);
-    src3 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 3), 0);
-    src4 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 4), 0);
-    src5 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 5), 0);
-    src6 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 6), 0);
-    src7 = (v16u8)__msa_ld_b((v16i8*)(src + src_stride * 7), 0);
-    vec0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0);
-    vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src2);
-    vec2 = (v16u8)__msa_ilvr_b((v16i8)src5, (v16i8)src4);
-    vec3 = (v16u8)__msa_ilvr_b((v16i8)src7, (v16i8)src6);
-    vec4 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0);
-    vec5 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src2);
-    vec6 = (v16u8)__msa_ilvl_b((v16i8)src5, (v16i8)src4);
-    vec7 = (v16u8)__msa_ilvl_b((v16i8)src7, (v16i8)src6);
-    reg0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0);
-    reg1 = (v16u8)__msa_ilvr_h((v8i16)vec3, (v8i16)vec2);
-    reg2 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0);
-    reg3 = (v16u8)__msa_ilvl_h((v8i16)vec3, (v8i16)vec2);
-    reg4 = (v16u8)__msa_ilvr_h((v8i16)vec5, (v8i16)vec4);
-    reg5 = (v16u8)__msa_ilvr_h((v8i16)vec7, (v8i16)vec6);
-    reg6 = (v16u8)__msa_ilvl_h((v8i16)vec5, (v8i16)vec5);
-    reg7 = (v16u8)__msa_ilvl_h((v8i16)vec7, (v8i16)vec6);
-    dst0 = (v16u8)__msa_ilvr_w((v4i32)reg1, (v4i32)reg0);
-    dst1 = (v16u8)__msa_ilvl_w((v4i32)reg1, (v4i32)reg0);
-    dst2 = (v16u8)__msa_ilvr_w((v4i32)reg3, (v4i32)reg2);
-    dst3 = (v16u8)__msa_ilvl_w((v4i32)reg3, (v4i32)reg2);
-    dst4 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg4);
-    dst5 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg4);
-    dst6 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg6);
-    dst7 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg6);
-    val0 = __msa_copy_s_d((v2i64)dst0, 0);
-    val1 = __msa_copy_s_d((v2i64)dst0, 1);
-    val2 = __msa_copy_s_d((v2i64)dst1, 0);
-    val3 = __msa_copy_s_d((v2i64)dst1, 1);
-    SD(val0, dst_a);
-    SD(val2, dst_a + dst_stride_a);
-    SD(val1, dst_b);
-    SD(val3, dst_b + dst_stride_b);
-    dst_a += dst_stride_a * 2;
-    dst_b += dst_stride_b * 2;
-    val0 = __msa_copy_s_d((v2i64)dst2, 0);
-    val1 = __msa_copy_s_d((v2i64)dst2, 1);
-    val2 = __msa_copy_s_d((v2i64)dst3, 0);
-    val3 = __msa_copy_s_d((v2i64)dst3, 1);
-    SD(val0, dst_a);
-    SD(val2, dst_a + dst_stride_a);
-    SD(val1, dst_b);
-    SD(val3, dst_b + dst_stride_b);
-    dst_a += dst_stride_a * 2;
-    dst_b += dst_stride_b * 2;
-    val0 = __msa_copy_s_d((v2i64)dst4, 0);
-    val1 = __msa_copy_s_d((v2i64)dst4, 1);
-    val2 = __msa_copy_s_d((v2i64)dst5, 0);
-    val3 = __msa_copy_s_d((v2i64)dst5, 1);
-    SD(val0, dst_a);
-    SD(val2, dst_a + dst_stride_a);
-    SD(val1, dst_b);
-    SD(val3, dst_b + dst_stride_b);
-    dst_a += dst_stride_a * 2;
-    dst_b += dst_stride_b * 2;
-    val0 = __msa_copy_s_d((v2i64)dst6, 0);
-    val1 = __msa_copy_s_d((v2i64)dst6, 1);
-    val2 = __msa_copy_s_d((v2i64)dst7, 0);
-    val3 = __msa_copy_s_d((v2i64)dst7, 1);
-    SD(val0, dst_a);
-    SD(val2, dst_a + dst_stride_a);
-    SD(val1, dst_b);
-    SD(val3, dst_b + dst_stride_b);
-    dst_a += dst_stride_a * 2;
-    dst_b += dst_stride_b * 2;
-    src += 16;
-  }
-}
-
+#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                         \
+    out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0);     \
+    out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0);     \
+    out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2);     \
+    out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2);     \
+  }
+
+#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                         \
+    out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0);     \
+    out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0);     \
+    out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2);     \
+    out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2);     \
+  }
+
+#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                         \
+    out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0);     \
+    out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0);     \
+    out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2);     \
+    out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2);     \
+  }
+
+#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                         \
+    out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0);     \
+    out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0);     \
+    out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2);     \
+    out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2);     \
+  }
+
+void TransposeWx16_C(const uint8* src,
+                     int src_stride,
+                     uint8* dst,
+                     int dst_stride,
+                     int width) {
+  TransposeWx8_C(src, src_stride, dst, dst_stride, width);
+  TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride,
+                 width);
+}
+
+void TransposeUVWx16_C(const uint8* src,
+                       int src_stride,
+                       uint8* dst_a,
+                       int dst_stride_a,
+                       uint8* dst_b,
+                       int dst_stride_b,
+                       int width) {
+  TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+                   width);
+  TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8),
+                   dst_stride_a, (dst_b + 8), dst_stride_b, width);
+}
+
+void TransposeWx16_MSA(const uint8* src,
+                       int src_stride,
+                       uint8* dst,
+                       int dst_stride,
+                       int width) {
+  int x;
+  const uint8* s;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
+
+  for (x = 0; x < width; x += 16) {
+    s = src;
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+    ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
+    ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0);
+    ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+    dst += dst_stride * 4;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1);
+    ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+    dst += dst_stride * 4;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2);
+    ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+    dst += dst_stride * 4;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3);
+    ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+    src += 16;
+    dst += dst_stride * 4;
+  }
+}
+
+void TransposeUVWx16_MSA(const uint8* src,
+                         int src_stride,
+                         uint8* dst_a,
+                         int dst_stride_a,
+                         uint8* dst_b,
+                         int dst_stride_b,
+                         int width) {
+  int x;
+  const uint8* s;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
+
+  for (x = 0; x < width; x += 8) {
+    s = src;
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+    ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
+    ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0);
+    ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
+    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+    dst_a += dst_stride_a * 2;
+    dst_b += dst_stride_b * 2;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1);
+    ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
+    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+    dst_a += dst_stride_a * 2;
+    dst_b += dst_stride_b * 2;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2);
+    ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
+    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+    dst_a += dst_stride_a * 2;
+    dst_b += dst_stride_b * 2;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3);
+    ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
+    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+    src += 16;
+    dst_a += dst_stride_a * 2;
+    dst_b += dst_stride_b * 2;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
 #endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
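
Note on the approach: the ILVRL_B/H/W/D interleave ladders added above implement the 16x16 byte transpose the issue title refers to. As a point of reference only (this sketch is not part of the patch, and the helper name is made up for illustration), the per-tile operation in plain C is:

// Scalar reference for one 16x16 tile: dst[j][i] = src[i][j].
// Illustrative only; the MSA path in this patch produces the same result
// with ilvr/ilvl interleaves and vector stores instead of per-byte copies.
static void TransposeTile16x16_Ref(const uint8_t* src, int src_stride,
                                   uint8_t* dst, int dst_stride) {
  int i, j;
  for (i = 0; i < 16; ++i) {    /* source row */
    for (j = 0; j < 16; ++j) {  /* source column */
      dst[j * dst_stride + i] = src[i * src_stride + j];
    }
  }
}

TransposeWx16_MSA walks a width-16 strip one such tile at a time (src += 16 per loop iteration); TransposeUVWx16_MSA applies the same interleave sequence to interleaved UV input and writes the de-interleaved results to dst_a and dst_b.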