Chromium Code Reviews
Unified diff: source/libvpx/vp9/common/mips/msa/vp9_convolve8_vert_msa.c

Issue 1169543007: libvpx: Pull from upstream (Closed)
Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 6 months ago
 /*
  * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  *
  * Use of this source code is governed by a BSD-style license
  * that can be found in the LICENSE file in the root of the source
  * tree. An additional intellectual property rights grant can be found
  * in the file PATENTS. All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
  */
 
 #include "./vp9_rtcd.h"
 #include "vp9/common/mips/msa/vp9_convolve_msa.h"
 
 static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
-  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
-  v16i8 src2110, src4332, src6554, src8776, src10998;
-  v8i16 filt, out10, out32;
-  v16i8 filt0, filt1, filt2, filt3;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+  v16i8 src10998, filt0, filt1, filt2, filt3;
+  v16u8 out;
+  v8i16 filt, out10, out32;
 
   src -= (3 * src_stride);
 
-  filt = LOAD_SH(filter);
-  filt0 = (v16i8)__msa_splati_h(filt, 0);
-  filt1 = (v16i8)__msa_splati_h(filt, 1);
-  filt2 = (v16i8)__msa_splati_h(filt, 2);
-  filt3 = (v16i8)__msa_splati_h(filt, 3);
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
-  LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
   src += (7 * src_stride);
 
-  ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
-                  src1, src3, src5, src2, src4, src6,
-                  src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);
-
-  ILVR_D_3VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r,
-                  src6554, src65_r, src54_r);
-
-  XORI_B_3VECS_SB(src2110, src4332, src6554, src2110, src4332, src6554, 128);
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+             src4332, src6554);
+  XORI_B3_128_SB(src2110, src4332, src6554);
 
   for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
     src += (4 * src_stride);
 
-    ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
-                    src76_r, src87_r, src98_r, src109_r);
-
-    ILVR_D_2VECS_SB(src8776, src87_r, src76_r, src10998, src109_r, src98_r);
-
-    XORI_B_2VECS_SB(src8776, src10998, src8776, src10998, 128);
-
-    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776,
-                                filt0, filt1, filt2, filt3);
-    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998,
-                                filt0, filt1, filt2, filt3);
-
-    out10 = SRARI_SATURATE_SIGNED_H(out10, FILTER_BITS, 7);
-    out32 = SRARI_SATURATE_SIGNED_H(out32, FILTER_BITS, 7);
-
-    PCKEV_2B_XORI128_STORE_4_BYTES_4(out10, out32, dst, dst_stride);
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+    XORI_B2_128_SB(src8776, src10998);
+    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+                                filt1, filt2, filt3);
+    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+                                filt1, filt2, filt3);
+    SRARI_H2_SH(out10, out32, FILTER_BITS);
+    SAT_SH2_SH(out10, out32, 7);
+    out = PCKEV_XORI128_UB(out10, out32);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
     dst += (4 * dst_stride);
 
     src2110 = src6554;
     src4332 = src8776;
     src6554 = src10998;
-
     src6 = src10;
   }
 }
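For readers who do not speak MSA, the whole 8-tap vertical path above reduces to the following plain-C reference (an illustrative sketch, not code from this patch; the function name is invented). The vector code computes the same values 4, 8 or 16 pixels at a time, relying on FILTER_BITS == 7 and on the eight taps summing to 1 << FILTER_BITS:

#include <stdint.h>

static void vt_8tap_scalar_ref(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride,
                               const int8_t *filter, /* 8 taps */
                               int width, int height) {
  int x, y, k;
  src -= 3 * src_stride; /* same pre-adjustment as the MSA code */
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      int sum = 0;
      for (k = 0; k < 8; ++k) {
        sum += filter[k] * src[(y + k) * src_stride + x];
      }
      /* round, shift by FILTER_BITS, clamp to a pixel (SRARI + saturate) */
      sum = (sum + 64) >> 7;
      dst[y * dst_stride + x] =
          (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
  }
}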
 
 static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
-  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
-  v16i8 filt0, filt1, filt2, filt3;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+  v16u8 tmp0, tmp1;
   v8i16 filt, out0_r, out1_r, out2_r, out3_r;
 
   src -= (3 * src_stride);
 
-  filt = LOAD_SH(filter);
-  filt0 = (v16i8)__msa_splati_h(filt, 0);
-  filt1 = (v16i8)__msa_splati_h(filt, 1);
-  filt2 = (v16i8)__msa_splati_h(filt, 2);
-  filt3 = (v16i8)__msa_splati_h(filt, 3);
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
-  LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
-  src += (7 * src_stride);
-
-  XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
-                  src0, src1, src2, src3, src4, src5, src6, 128);
-
-  ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
-                  src1, src3, src5, src2, src4, src6,
-                  src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
   for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
-    src += (4 * src_stride);
-
-    XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);
-
-    ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
-                    src76_r, src87_r, src98_r, src109_r);
-
-    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
-                                 filt0, filt1, filt2, filt3);
-    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
-                                 filt0, filt1, filt2, filt3);
-    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
-                                 filt0, filt1, filt2, filt3);
-    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
-                                 filt0, filt1, filt2, filt3);
-
-    out0_r = SRARI_SATURATE_SIGNED_H(out0_r, FILTER_BITS, 7);
-    out1_r = SRARI_SATURATE_SIGNED_H(out1_r, FILTER_BITS, 7);
-    out2_r = SRARI_SATURATE_SIGNED_H(out2_r, FILTER_BITS, 7);
-    out3_r = SRARI_SATURATE_SIGNED_H(out3_r, FILTER_BITS, 7);
-
-    PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0_r, out1_r, out2_r, out3_r,
-                                      dst, dst_stride);
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                 filt1, filt2, filt3);
+    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                 filt1, filt2, filt3);
+    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                 filt1, filt2, filt3);
+    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                 filt1, filt2, filt3);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+    tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+    tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
     dst += (4 * dst_stride);
 
     src10_r = src54_r;
     src32_r = src76_r;
     src54_r = src98_r;
     src21_r = src65_r;
     src43_r = src87_r;
     src65_r = src109_r;
-
     src6 = src10;
   }
 }
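A note on the XORI_*_128 pairs used throughout the 8-tap functions: the filter taps can be negative, so the dot products run in signed 8-bit arithmetic, while pixels are unsigned bytes. XORing a byte with 128 flips its top bit, which maps an unsigned pixel p in [0, 255] to the signed value p - 128 in [-128, 127] at no extra cost. Because the eight taps sum to 1 << FILTER_BITS, the -128 input bias comes out of the filter as a -128 output bias, so after signed saturation a final XOR with 128 (folded into PCKEV_XORI128_UB above) restores pixels clamped to [0, 255]. A scalar sketch of the two conversions (illustrative only):

#include <stdint.h>

/* p in [0, 255] -> p - 128 in [-128, 127]: same bit pattern, signed view */
static inline int8_t pixel_to_signed(uint8_t p) { return (int8_t)(p ^ 0x80); }

/* after filtering and signed saturation, undo the bias */
static inline uint8_t pixel_to_unsigned(int8_t q) {
  return (uint8_t)(q ^ 0x80);
}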
+
+static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 filt0, filt1, filt2, filt3;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+  v16u8 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+             src54_l, src21_l);
+  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+               src87_l, src98_l, src109_l);
+    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                 filt1, filt2, filt3);
+    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                 filt1, filt2, filt3);
+    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                 filt1, filt2, filt3);
+    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                 filt1, filt2, filt3);
+    out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+                                 filt1, filt2, filt3);
+    out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+                                 filt1, filt2, filt3);
+    out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+                                 filt1, filt2, filt3);
+    out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+                                 filt1, filt2, filt3);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+    SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+    SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+    PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
+                tmp0, tmp1, tmp2, tmp3);
+    XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+    ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    src10_r = src54_r;
+    src32_r = src76_r;
+    src54_r = src98_r;
+    src21_r = src65_r;
+    src43_r = src87_r;
+    src65_r = src109_r;
+    src10_l = src54_l;
+    src32_l = src76_l;
+    src54_l = src98_l;
+    src21_l = src65_l;
+    src43_l = src87_l;
+    src65_l = src109_l;
+    src6 = src10;
+  }
+}
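The ILVR_*/ILVL_* calls are where the vertical layout is built: interleaving row t+1 with row t yields vectors of vertical byte pairs, so each dot-product step folds two taps at once; the _r forms cover the low eight bytes (columns 0-7) and the _l forms the high eight bytes (columns 8-15) of a 16-wide block. A sketch with the underlying GCC MSA intrinsics (an assumed expansion of the libvpx macros; compiles only on MIPS with -mmsa):

#include <msa.h>

static void vertical_pairs(v16i8 row0, v16i8 row1, v16i8 *lo, v16i8 *hi) {
  /* {row0[0], row1[0], row0[1], row1[1], ...} for columns 0..7 ... */
  *lo = __msa_ilvr_b(row1, row0);
  /* ... and the matching interleave for columns 8..15 */
  *hi = __msa_ilvl_b(row1, row0);
}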
 
 static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
                                       uint8_t *dst, int32_t dst_stride,
                                       int8_t *filter, int32_t height,
                                       int32_t width) {
   const uint8_t *src_tmp;
   uint8_t *dst_tmp;
   uint32_t loop_cnt, cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
   v16i8 filt0, filt1, filt2, filt3;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
-  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
-  v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
-  v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
-  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
-  v8i16 filt;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
   v16u8 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
 
   src -= (3 * src_stride);
 
-  filt = LOAD_SH(filter);
-  filt0 = (v16i8)__msa_splati_h(filt, 0);
-  filt1 = (v16i8)__msa_splati_h(filt, 1);
-  filt2 = (v16i8)__msa_splati_h(filt, 2);
-  filt3 = (v16i8)__msa_splati_h(filt, 3);
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
   for (cnt = (width >> 4); cnt--;) {
     src_tmp = src;
     dst_tmp = dst;
 
-    LOAD_7VECS_SB(src_tmp, src_stride,
-                  src0, src1, src2, src3, src4, src5, src6);
-    src_tmp += (7 * src_stride);
-
-    XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
-                    src0, src1, src2, src3, src4, src5, src6, 128);
-
-    ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
-                    src1, src3, src5, src2, src4, src6,
-                    src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);
-
-    ILVL_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
-                    src1, src3, src5, src2, src4, src6,
-                    src10_l, src32_l, src54_l, src21_l, src43_l, src65_l);
+    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+    src_tmp += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
+               src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
+               src32_l, src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
 
     for (loop_cnt = (height >> 2); loop_cnt--;) {
-      LOAD_4VECS_SB(src_tmp, src_stride, src7, src8, src9, src10);
-      src_tmp += (4 * src_stride);
-
-      XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);
-
-      ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
-                      src76_r, src87_r, src98_r, src109_r);
-
-      ILVL_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
-                      src76_l, src87_l, src98_l, src109_l);
-
-      out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
-                                   filt0, filt1, filt2, filt3);
-      out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
-                                   filt0, filt1, filt2, filt3);
-      out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
-                                   filt0, filt1, filt2, filt3);
-      out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
-                                   filt0, filt1, filt2, filt3);
-
-      out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
-                                   filt0, filt1, filt2, filt3);
-      out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
-                                   filt0, filt1, filt2, filt3);
-      out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
-                                   filt0, filt1, filt2, filt3);
-      out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
-                                   filt0, filt1, filt2, filt3);
-
-      out0_r = SRARI_SATURATE_SIGNED_H(out0_r, FILTER_BITS, 7);
-      out1_r = SRARI_SATURATE_SIGNED_H(out1_r, FILTER_BITS, 7);
-      out2_r = SRARI_SATURATE_SIGNED_H(out2_r, FILTER_BITS, 7);
-      out3_r = SRARI_SATURATE_SIGNED_H(out3_r, FILTER_BITS, 7);
-      out0_l = SRARI_SATURATE_SIGNED_H(out0_l, FILTER_BITS, 7);
-      out1_l = SRARI_SATURATE_SIGNED_H(out1_l, FILTER_BITS, 7);
-      out2_l = SRARI_SATURATE_SIGNED_H(out2_l, FILTER_BITS, 7);
-      out3_l = SRARI_SATURATE_SIGNED_H(out3_l, FILTER_BITS, 7);
-
-      out0_r = (v8i16)__msa_pckev_b((v16i8)out0_l, (v16i8)out0_r);
-      out1_r = (v8i16)__msa_pckev_b((v16i8)out1_l, (v16i8)out1_r);
-      out2_r = (v8i16)__msa_pckev_b((v16i8)out2_l, (v16i8)out2_r);
-      out3_r = (v8i16)__msa_pckev_b((v16i8)out3_l, (v16i8)out3_r);
-
-      XORI_B_4VECS_UB(out0_r, out1_r, out2_r, out3_r,
-                      tmp0, tmp1, tmp2, tmp3, 128);
-
-      STORE_4VECS_UB(dst_tmp, dst_stride, tmp0, tmp1, tmp2, tmp3);
+      LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+      XORI_B4_128_SB(src7, src8, src9, src10);
+      src_tmp += (4 * src_stride);
+      ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                 src87_r, src98_r, src109_r);
+      ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                 src87_l, src98_l, src109_l);
+      out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                   filt1, filt2, filt3);
+      out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                   filt1, filt2, filt3);
+      out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                   filt1, filt2, filt3);
+      out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                   filt1, filt2, filt3);
+      out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+                                   filt1, filt2, filt3);
+      out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+                                   filt1, filt2, filt3);
+      out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+                                   filt1, filt2, filt3);
+      out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+                                   filt1, filt2, filt3);
+      SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+      SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+      SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+      SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+      PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                  out3_r, tmp0, tmp1, tmp2, tmp3);
+      XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+      ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
       dst_tmp += (4 * dst_stride);
 
       src10_r = src54_r;
       src32_r = src76_r;
       src54_r = src98_r;
       src21_r = src65_r;
       src43_r = src87_r;
       src65_r = src109_r;
-
       src10_l = src54_l;
       src32_l = src76_l;
       src54_l = src98_l;
       src21_l = src65_l;
       src43_l = src87_l;
       src65_l = src109_l;
-
       src6 = src10;
     }
 
     src += 16;
     dst += 16;
   }
 }
 
-static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter, int32_t height) {
-  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
-                            filter, height, 16);
-}
-
 static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter, int32_t height) {
-  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
-                            filter, height, 32);
+  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                            32);
 }
 
 static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter, int32_t height) {
-  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
-                            filter, height, 64);
+  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                            64);
 }
 
 static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter) {
-  uint32_t out0, out1, out2, out3;
   v16i8 src0, src1, src2, src3, src4;
   v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
-  v16i8 filt0;
-  v8u16 filt;
+  v16u8 filt0;
+  v8i16 filt;
+  v8u16 tmp0, tmp1;
 
-  filt = LOAD_UH(filter);
-  filt0 = (v16i8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
 
-  LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4);
+  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
   src += (5 * src_stride);
 
-  ILVR_B_4VECS_SB(src0, src1, src2, src3, src1, src2, src3, src4,
-                  src10_r, src21_r, src32_r, src43_r);
-
-  ILVR_D_2VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r);
-
-  src2110 = (v16i8)__msa_dotp_u_h((v16u8)src2110, (v16u8)filt0);
-  src4332 = (v16i8)__msa_dotp_u_h((v16u8)src4332, (v16u8)filt0);
-
-  src2110 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src2110, FILTER_BITS, 7);
-  src4332 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src4332, FILTER_BITS, 7);
-
-  src2110 = (v16i8)__msa_pckev_b((v16i8)src4332, (v16i8)src2110);
-
-  out0 = __msa_copy_u_w((v4i32)src2110, 0);
-  out1 = __msa_copy_u_w((v4i32)src2110, 1);
-  out2 = __msa_copy_u_w((v4i32)src2110, 2);
-  out3 = __msa_copy_u_w((v4i32)src2110, 3);
-
-  STORE_WORD(dst, out0);
-  dst += dst_stride;
-  STORE_WORD(dst, out1);
-  dst += dst_stride;
-  STORE_WORD(dst, out2);
-  dst += dst_stride;
-  STORE_WORD(dst, out3);
+  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+             src32_r, src43_r);
+  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+  SAT_UH2_UH(tmp0, tmp1, 7);
+  src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
 }
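The common_vt_2t_* functions are the bilinear case: only two adjacent rows contribute and both taps are non-negative, so the arithmetic stays unsigned (DOTP_UB*) and the XOR-128 trick is not needed. A plain-C sketch of what they compute (illustrative; it assumes the caller passes a filter pointer whose first two bytes are the two non-zero taps, which again sum to 1 << FILTER_BITS):

#include <stdint.h>

static void vt_2tap_scalar_ref(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride,
                               const int8_t *filter, /* two taps */
                               int width, int height) {
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      int sum = filter[0] * src[y * src_stride + x] +
                filter[1] * src[(y + 1) * src_stride + x];
      /* taps sum to 128, so the rounded shift already lands in [0, 255] */
      dst[y * dst_stride + x] = (uint8_t)((sum + 64) >> 7);
    }
  }
}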
 
 static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter) {
-  uint32_t out0, out1, out2, out3, out4, out5, out6, out7;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
   v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
   v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
-  v16i8 filt0;
-  v8u16 filt;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v16u8 filt0;
+  v8i16 filt;
 
-  filt = LOAD_UH(filter);
-  filt0 = (v16i8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
 
-  LOAD_8VECS_SB(src, src_stride,
-                src0, src1, src2, src3, src4, src5, src6, src7);
+  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
   src += (8 * src_stride);
 
-  src8 = LOAD_SB(src);
+  src8 = LD_SB(src);
   src += src_stride;
 
-  ILVR_B_8VECS_SB(src0, src1, src2, src3, src4, src5, src6, src7,
-                  src1, src2, src3, src4, src5, src6, src7, src8,
-                  src10_r, src21_r, src32_r, src43_r,
-                  src54_r, src65_r, src76_r, src87_r);
-
-  ILVR_D_4VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r,
-                  src6554, src65_r, src54_r, src8776, src87_r, src76_r);
-
-  src2110 = (v16i8)__msa_dotp_u_h((v16u8)src2110, (v16u8)filt0);
-  src4332 = (v16i8)__msa_dotp_u_h((v16u8)src4332, (v16u8)filt0);
-  src6554 = (v16i8)__msa_dotp_u_h((v16u8)src6554, (v16u8)filt0);
-  src8776 = (v16i8)__msa_dotp_u_h((v16u8)src8776, (v16u8)filt0);
-
-  src2110 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src2110, FILTER_BITS, 7);
-  src4332 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src4332, FILTER_BITS, 7);
-  src6554 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src6554, FILTER_BITS, 7);
-  src8776 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src8776, FILTER_BITS, 7);
-
-  src2110 = (v16i8)__msa_pckev_b((v16i8)src4332, (v16i8)src2110);
-  src4332 = (v16i8)__msa_pckev_b((v16i8)src8776, (v16i8)src6554);
-
-  out0 = __msa_copy_u_w((v4i32)src2110, 0);
-  out1 = __msa_copy_u_w((v4i32)src2110, 1);
-  out2 = __msa_copy_u_w((v4i32)src2110, 2);
-  out3 = __msa_copy_u_w((v4i32)src2110, 3);
-  out4 = __msa_copy_u_w((v4i32)src4332, 0);
-  out5 = __msa_copy_u_w((v4i32)src4332, 1);
-  out6 = __msa_copy_u_w((v4i32)src4332, 2);
-  out7 = __msa_copy_u_w((v4i32)src4332, 3);
-
-  STORE_WORD(dst, out0);
-  dst += dst_stride;
-  STORE_WORD(dst, out1);
-  dst += dst_stride;
-  STORE_WORD(dst, out2);
-  dst += dst_stride;
-  STORE_WORD(dst, out3);
-  dst += dst_stride;
-  STORE_WORD(dst, out4);
-  dst += dst_stride;
-  STORE_WORD(dst, out5);
-  dst += dst_stride;
-  STORE_WORD(dst, out6);
-  dst += dst_stride;
-  STORE_WORD(dst, out7);
+  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+             src32_r, src43_r);
+  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+             src76_r, src87_r);
+  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+             src87_r, src76_r, src2110, src4332, src6554, src8776);
+  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+              tmp0, tmp1, tmp2, tmp3);
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
 }
 
 static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
   if (4 == height) {
     common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
   } else if (8 == height) {
     common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
   }
 }
 
 static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter) {
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 vec0, vec1, vec2, vec3, filt0;
+  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+  v16i8 out0, out1;
   v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8u16 filt;
+  v8i16 filt;
 
   /* rearranging filter_y */
-  filt = LOAD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
 
-  LOAD_5VECS_UB(src, src_stride, src0, src1, src2, src3, src4);
-
-  ILVR_B_2VECS_UB(src0, src1, src1, src2, vec0, vec1);
-  ILVR_B_2VECS_UB(src2, src3, src3, src4, vec2, vec3);
-
-  /* filter calc */
-  tmp0 = __msa_dotp_u_h(vec0, filt0);
-  tmp1 = __msa_dotp_u_h(vec1, filt0);
-  tmp2 = __msa_dotp_u_h(vec2, filt0);
-  tmp3 = __msa_dotp_u_h(vec3, filt0);
-
-  tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-  tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-  tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-  tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-
-  PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+              tmp2, tmp3);
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+  ST8x4_UB(out0, out1, dst, dst_stride);
 }
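Every result in these bilinear functions goes through the same SRARI_H* / SAT_UH* pair before pckev.b packs the 16-bit lanes back to bytes: srari.h is a rounding arithmetic right shift, and sat_u.h with immediate 7 clamps each unsigned halfword to 8 significant bits. One lane in scalar form, under those assumed instruction semantics:

#include <stdint.h>

static inline uint16_t srari_sat_u8(uint16_t acc) {
  uint16_t r = (uint16_t)((acc + 64) >> 7); /* FILTER_BITS == 7 */
  return (uint16_t)(r > 255 ? 255 : r);     /* saturate to 8 bits */
}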
 
 static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
   v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v16i8 out0, out1;
   v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8u16 filt;
+  v8i16 filt;
 
   /* rearranging filter_y */
-  filt = LOAD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
 
-  src0 = LOAD_UB(src);
+  src0 = LD_UB(src);
   src += src_stride;
 
   for (loop_cnt = (height >> 3); loop_cnt--;) {
-    LOAD_8VECS_UB(src, src_stride,
-                  src1, src2, src3, src4, src5, src6, src7, src8);
+    LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
     src += (8 * src_stride);
 
-    ILVR_B_4VECS_UB(src0, src1, src2, src3, src1, src2, src3, src4,
-                    vec0, vec1, vec2, vec3);
-
-    ILVR_B_4VECS_UB(src4, src5, src6, src7, src5, src6, src7, src8,
-                    vec4, vec5, vec6, vec7);
-
-    tmp0 = __msa_dotp_u_h(vec0, filt0);
-    tmp1 = __msa_dotp_u_h(vec1, filt0);
-    tmp2 = __msa_dotp_u_h(vec2, filt0);
-    tmp3 = __msa_dotp_u_h(vec3, filt0);
-
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1,
+               vec2, vec3);
+    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5,
+               vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+                tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
     dst += (4 * dst_stride);
 
-    tmp0 = __msa_dotp_u_h(vec4, filt0);
-    tmp1 = __msa_dotp_u_h(vec5, filt0);
-    tmp2 = __msa_dotp_u_h(vec6, filt0);
-    tmp3 = __msa_dotp_u_h(vec7, filt0);
-
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
+                tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
     dst += (4 * dst_stride);
 
     src0 = src8;
   }
 }
 
 static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
   if (4 == height) {
     common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
   } else {
     common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
   }
 }
 
 static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
   v16u8 src0, src1, src2, src3, src4;
   v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
   v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8u16 filt;
+  v8i16 filt;
 
   /* rearranging filter_y */
-  filt = LOAD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
 
-  src0 = LOAD_UB(src);
+  src0 = LD_UB(src);
   src += src_stride;
 
   for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LOAD_4VECS_UB(src, src_stride, src1, src2, src3, src4);
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
     src += (4 * src_stride);
 
-    ILV_B_LRLR_UB(src0, src1, src1, src2, vec1, vec0, vec3, vec2);
-
-    tmp0 = __msa_dotp_u_h(vec0, filt0);
-    tmp1 = __msa_dotp_u_h(vec1, filt0);
-
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp1, tmp0, dst);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
     dst += dst_stride;
 
-    ILV_B_LRLR_UB(src2, src3, src3, src4, vec5, vec4, vec7, vec6);
-
-    tmp2 = __msa_dotp_u_h(vec2, filt0);
-    tmp3 = __msa_dotp_u_h(vec3, filt0);
-
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp3, tmp2, dst);
+    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    PCKEV_ST_SB(tmp2, tmp3, dst);
     dst += dst_stride;
 
-    tmp0 = __msa_dotp_u_h(vec4, filt0);
-    tmp1 = __msa_dotp_u_h(vec5, filt0);
-
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp1, tmp0, dst);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
     dst += dst_stride;
 
-    tmp2 = __msa_dotp_u_h(vec6, filt0);
-    tmp3 = __msa_dotp_u_h(vec7, filt0);
-
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp3, tmp2, dst);
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    PCKEV_ST_SB(tmp2, tmp3, dst);
     dst += dst_stride;
 
     src0 = src4;
   }
 }
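PCKEV_ST_SB in the 16-, 32- and 64-wide loops is the recombination step: the low and high halves of a row are filtered separately as eight 16-bit lanes each, then pckev.b keeps the even (least-significant) byte of every lane, merging both halves into one 16-pixel vector for a single store. An assumed expansion (MIPS/MSA only; the helper name is invented):

#include <msa.h>
#include <stdint.h>

static void pckev_store16(v8u16 lo, v8u16 hi, uint8_t *dst) {
  v16i8 row = __msa_pckev_b((v16i8)hi, (v16i8)lo); /* hi -> bytes 8..15 */
  __msa_st_b(row, dst, 0);
}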
 
 static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
   v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
   v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8u16 filt;
+  v8i16 filt;
 
   /* rearranging filter_y */
-  filt = LOAD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
 
-  src0 = LOAD_UB(src);
-  src5 = LOAD_UB(src + 16);
+  src0 = LD_UB(src);
+  src5 = LD_UB(src + 16);
   src += src_stride;
 
   for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LOAD_4VECS_UB(src, src_stride, src1, src2, src3, src4);
-
-    ILV_B_LRLR_UB(src0, src1, src1, src2, vec1, vec0, vec3, vec2);
-
-    LOAD_4VECS_UB(src + 16, src_stride, src6, src7, src8, src9);
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+
+    LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
     src += (4 * src_stride);
 
-    tmp0 = __msa_dotp_u_h(vec0, filt0);
-    tmp1 = __msa_dotp_u_h(vec1, filt0);
-
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp1, tmp0, dst);
-
-    tmp2 = __msa_dotp_u_h(vec2, filt0);
-    tmp3 = __msa_dotp_u_h(vec3, filt0);
-
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp3, tmp2, dst + dst_stride);
-
-    ILV_B_LRLR_UB(src2, src3, src3, src4, vec5, vec4, vec7, vec6);
-
-    tmp0 = __msa_dotp_u_h(vec4, filt0);
-    tmp1 = __msa_dotp_u_h(vec5, filt0);
-
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 2 * dst_stride);
-
-    tmp2 = __msa_dotp_u_h(vec6, filt0);
-    tmp3 = __msa_dotp_u_h(vec7, filt0);
-
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 3 * dst_stride);
-
-    ILV_B_LRLR_UB(src5, src6, src6, src7, vec1, vec0, vec3, vec2);
-
-    tmp0 = __msa_dotp_u_h(vec0, filt0);
-    tmp1 = __msa_dotp_u_h(vec1, filt0);
-
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 16);
-
-    tmp2 = __msa_dotp_u_h(vec2, filt0);
-    tmp3 = __msa_dotp_u_h(vec3, filt0);
-
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 16 + dst_stride);
-
-    ILV_B_LRLR_UB(src7, src8, src8, src9, vec5, vec4, vec7, vec6);
-
-    tmp0 = __msa_dotp_u_h(vec4, filt0);
-    tmp1 = __msa_dotp_u_h(vec5, filt0);
-
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 16 + 2 * dst_stride);
-
-    tmp2 = __msa_dotp_u_h(vec6, filt0);
-    tmp3 = __msa_dotp_u_h(vec7, filt0);
-
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 16 + 3 * dst_stride);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
+
+    ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+    ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 16);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
+
+    ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+    ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
     dst += (4 * dst_stride);
 
     src0 = src4;
     src5 = src9;
   }
 }
 
 static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16u8 src8, src9, src10, src11;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
   v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  v8u16 filt;
+  v8i16 filt;
 
   /* rearranging filter_y */
-  filt = LOAD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
 
-  LOAD_4VECS_UB(src, 16, src0, src3, src6, src9);
+  LD_UB4(src, 16, src0, src3, src6, src9);
   src += src_stride;
 
   for (loop_cnt = (height >> 1); loop_cnt--;) {
-    LOAD_2VECS_UB(src, src_stride, src1, src2);
-    LOAD_2VECS_UB(src + 16, src_stride, src4, src5);
-    LOAD_2VECS_UB(src + 32, src_stride, src7, src8);
-    LOAD_2VECS_UB(src + 48, src_stride, src10, src11);
+    LD_UB2(src, src_stride, src1, src2);
+    LD_UB2(src + 16, src_stride, src4, src5);
+    LD_UB2(src + 32, src_stride, src7, src8);
+    LD_UB2(src + 48, src_stride, src10, src11);
     src += (2 * src_stride);
 
-    ILV_B_LRLR_UB(src0, src1, src1, src2, vec1, vec0, vec3, vec2);
-
-    tmp0 = __msa_dotp_u_h(vec0, filt0);
-    tmp1 = __msa_dotp_u_h(vec1, filt0);
-
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp1, tmp0, dst);
-
-    tmp2 = __msa_dotp_u_h(vec2, filt0);
-    tmp3 = __msa_dotp_u_h(vec3, filt0);
-
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp3, tmp2, dst + dst_stride);
-
-    ILV_B_LRLR_UB(src3, src4, src4, src5, vec5, vec4, vec7, vec6);
-
-    tmp4 = __msa_dotp_u_h(vec4, filt0);
-    tmp5 = __msa_dotp_u_h(vec5, filt0);
-
-    tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7);
-    tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp5, tmp4, dst + 16);
-
-    tmp6 = __msa_dotp_u_h(vec6, filt0);
-    tmp7 = __msa_dotp_u_h(vec7, filt0);
-
-    tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 7);
-    tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp7, tmp6, dst + 16 + dst_stride);
-
-    ILV_B_LRLR_UB(src6, src7, src7, src8, vec1, vec0, vec3, vec2);
-
-    tmp0 = __msa_dotp_u_h(vec0, filt0);
-    tmp1 = __msa_dotp_u_h(vec1, filt0);
-
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 32);
-
-    tmp2 = __msa_dotp_u_h(vec2, filt0);
-    tmp3 = __msa_dotp_u_h(vec3, filt0);
-
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 32 + dst_stride);
-
-    ILV_B_LRLR_UB(src9, src10, src10, src11, vec5, vec4, vec7, vec6);
-
-    tmp4 = __msa_dotp_u_h(vec4, filt0);
-    tmp5 = __msa_dotp_u_h(vec5, filt0);
-
-    tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7);
-    tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp5, tmp4, dst + 48);
-
-    tmp6 = __msa_dotp_u_h(vec6, filt0);
-    tmp7 = __msa_dotp_u_h(vec7, filt0);
-
-    tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 7);
-    tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp7, tmp6, dst + 48 + dst_stride);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+    ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+    ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+    SAT_UH2_UH(tmp4, tmp5, 7);
+    PCKEV_ST_SB(tmp4, tmp5, dst + 16);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+    SAT_UH2_UH(tmp6, tmp7, 7);
+    PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
+
+    ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+    ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 32);
+
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
+
+    ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+    ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+    SAT_UH2_UH(tmp4, tmp5, 7);
+    PCKEV_ST_SB(tmp4, tmp5, dst + 48);
+
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+    SAT_UH2_UH(tmp6, tmp7, 7);
+    PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
     dst += (2 * dst_stride);
 
     src0 = src2;
     src3 = src5;
     src6 = src8;
     src9 = src11;
   }
 }
 
 void vp9_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
(... skipping 82 matching lines ...)
                              filt_ver, h);
         break;
       default:
         vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
         break;
     }
   }
 }