Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(145)

Side by Side Diff: source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h

Issue 1169543007: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 /* 1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #ifndef VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ 11 #ifndef VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
12 #define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ 12 #define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
13 13
14 #include "vp9/common/vp9_filter.h" 14 #include "vp9/common/vp9_filter.h"
15 #include "vp9/common/mips/msa/vp9_macros_msa.h" 15 #include "vp9/common/mips/msa/vp9_macros_msa.h"
16 16
17 extern const uint8_t mc_filt_mask_arr[16 * 3]; 17 extern const uint8_t mc_filt_mask_arr[16 * 3];
18 18
19 #define HORIZ_8TAP_FILT(src, mask0, mask1, mask2, mask3, \ 19 #define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \
20 filt_h0, filt_h1, filt_h2, filt_h3) ({ \ 20 filt0, filt1, filt2, filt3) ({ \
21 v8i16 vec0, vec1, vec2, vec3, horiz_out; \ 21 v8i16 tmp0, tmp1; \
22 \ 22 \
23 vec0 = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src), (v16i8)(src)); \ 23 tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \
24 vec0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)(filt_h0)); \ 24 tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \
25 vec1 = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src), (v16i8)(src)); \ 25 tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \
26 vec0 = __msa_dpadd_s_h(vec0, (v16i8)(filt_h1), (v16i8)vec1); \ 26 tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3); \
27 vec2 = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src), (v16i8)(src)); \ 27 tmp0 = __msa_adds_s_h(tmp0, tmp1); \
28 vec2 = __msa_dotp_s_h((v16i8)vec2, (v16i8)(filt_h2)); \ 28 \
29 vec3 = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src), (v16i8)(src)); \ 29 tmp0; \
30 vec2 = __msa_dpadd_s_h(vec2, (v16i8)(filt_h3), (v16i8)vec3); \
31 vec0 = __msa_adds_s_h(vec0, vec2); \
32 horiz_out = SRARI_SATURATE_SIGNED_H(vec0, FILTER_BITS, 7); \
33 \
34 horiz_out; \
35 }) 30 })
36 31
37 #define HORIZ_8TAP_FILT_2VECS(src0, src1, mask0, mask1, mask2, mask3, \ 32 #define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, \
38 filt_h0, filt_h1, filt_h2, filt_h3) ({ \ 33 filt_h0, filt_h1, filt_h2, filt_h3) ({ \
39 v8i16 vec0, vec1, vec2, vec3, horiz_out; \ 34 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
40 \ 35 v8i16 hz_out_m; \
41 vec0 = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src0)); \ 36 \
42 vec0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)(filt_h0)); \ 37 VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, \
43 vec1 = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src0)); \ 38 vec0_m, vec1_m, vec2_m, vec3_m); \
44 vec0 = __msa_dpadd_s_h(vec0, (v16i8)(filt_h1), (v16i8)vec1); \ 39 hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, \
45 vec2 = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src0)); \ 40 filt_h0, filt_h1, filt_h2, filt_h3); \
46 vec2 = __msa_dotp_s_h((v16i8)vec2, (v16i8)(filt_h2)); \ 41 \
47 vec3 = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src0)); \ 42 hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \
48 vec2 = __msa_dpadd_s_h(vec2, ((v16i8)filt_h3), (v16i8)vec3); \ 43 hz_out_m = __msa_sat_s_h(hz_out_m, 7); \
49 vec0 = __msa_adds_s_h(vec0, vec2); \ 44 \
50 horiz_out = (v8i16)SRARI_SATURATE_SIGNED_H(vec0, FILTER_BITS, 7); \ 45 hz_out_m; \
51 \
52 horiz_out; \
53 }) 46 })
54 47
55 #define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \ 48 #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
56 filt0, filt1, filt2, filt3) ({ \ 49 mask0, mask1, mask2, mask3, \
57 v8i16 tmp0, tmp1; \ 50 filt0, filt1, filt2, filt3, \
58 \ 51 out0, out1) { \
59 tmp0 = __msa_dotp_s_h((v16i8)(vec0), (v16i8)(filt0)); \ 52 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
60 tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)(vec1), (v16i8)(filt1)); \ 53 v8i16 res0_m, res1_m, res2_m, res3_m; \
61 tmp1 = __msa_dotp_s_h((v16i8)(vec2), (v16i8)(filt2)); \ 54 \
62 tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)(vec3), ((v16i8)filt3)); \ 55 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
63 tmp0 = __msa_adds_s_h(tmp0, tmp1); \ 56 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
64 \ 57 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
65 tmp0; \ 58 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
66 }) 59 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
67 60 DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
68 #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ 61 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
69 mask0, mask1, mask2, mask3, \ 62 DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
70 filt0, filt1, filt2, filt3, \ 63 ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
71 out0, out1) { \
72 v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
73 v8i16 res0_m, res1_m, res2_m, res3_m; \
74 \
75 vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src0)); \
76 vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src3), (v16i8)(src2)); \
77 \
78 res0_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt0)); \
79 res1_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt0)); \
80 \
81 vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src0)); \
82 vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src3), (v16i8)(src2)); \
83 \
84 res0_m = __msa_dpadd_s_h(res0_m, (filt1), (v16i8)vec2_m); \
85 res1_m = __msa_dpadd_s_h(res1_m, (filt1), (v16i8)vec3_m); \
86 \
87 vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src0)); \
88 vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src3), (v16i8)(src2)); \
89 \
90 res2_m = __msa_dotp_s_h((v16i8)(filt2), (v16i8)vec4_m); \
91 res3_m = __msa_dotp_s_h((v16i8)(filt2), (v16i8)vec5_m); \
92 \
93 vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src0)); \
94 vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src3), (v16i8)(src2)); \
95 \
96 res2_m = __msa_dpadd_s_h(res2_m, (v16i8)(filt3), (v16i8)vec6_m); \
97 res3_m = __msa_dpadd_s_h(res3_m, (v16i8)(filt3), (v16i8)vec7_m); \
98 \
99 out0 = __msa_adds_s_h(res0_m, res2_m); \
100 out1 = __msa_adds_s_h(res1_m, res3_m); \
101 } 64 }
102 65
103 #define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ 66 #define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
104 mask0, mask1, mask2, mask3, \ 67 mask0, mask1, mask2, mask3, \
105 filt0, filt1, filt2, filt3, \ 68 filt0, filt1, filt2, filt3, \
106 out0, out1, out2, out3) { \ 69 out0, out1, out2, out3) { \
107 v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \ 70 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
108 v8i16 vec4_m, vec5_m, vec6_m, vec7_m; \ 71 v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
109 v8i16 res0_m, res1_m, res2_m, res3_m; \ 72 \
110 v8i16 res4_m, res5_m, res6_m, res7_m; \ 73 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
111 \ 74 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
112 vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src0), (v16i8)(src0)); \ 75 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
113 vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src1)); \ 76 res0_m, res1_m, res2_m, res3_m); \
114 vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src2), (v16i8)(src2)); \ 77 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
115 vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src3), (v16i8)(src3)); \ 78 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
116 \ 79 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
117 res0_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt0)); \ 80 res4_m, res5_m, res6_m, res7_m); \
118 res1_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt0)); \ 81 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
119 res2_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt0)); \ 82 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
120 res3_m = __msa_dotp_s_h((v16i8)vec3_m, (v16i8)(filt0)); \ 83 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
121 \ 84 res0_m, res1_m, res2_m, res3_m); \
122 vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src0), (v16i8)(src0)); \ 85 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
123 vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src1)); \ 86 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
124 vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src2), (v16i8)(src2)); \ 87 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
125 vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src3), (v16i8)(src3)); \ 88 res4_m, res5_m, res6_m, res7_m); \
126 \ 89 ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
127 res4_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt2)); \ 90 res7_m, out0, out1, out2, out3); \
128 res5_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt2)); \ 91 }
129 res6_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt2)); \ 92
130 res7_m = __msa_dotp_s_h((v16i8)vec3_m, (v16i8)(filt2)); \ 93 #define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) { \
131 \ 94 v16u8 tmp_m; \
132 vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src0), (v16i8)(src0)); \ 95 \
133 vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src1)); \ 96 tmp_m = PCKEV_XORI128_UB(in1, in0); \
134 vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src2), (v16i8)(src2)); \ 97 tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
135 vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src3), (v16i8)(src3)); \ 98 ST_UB(tmp_m, (pdst)); \
136 \ 99 }
137 res0_m = __msa_dpadd_s_h(res0_m, (v16i8)(filt1), (v16i8)vec4_m); \ 100
138 res1_m = __msa_dpadd_s_h(res1_m, (v16i8)(filt1), (v16i8)vec5_m); \ 101 #define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) { \
139 res2_m = __msa_dpadd_s_h(res2_m, (v16i8)(filt1), (v16i8)vec6_m); \ 102 v16u8 tmp_m; \
140 res3_m = __msa_dpadd_s_h(res3_m, (v16i8)(filt1), (v16i8)vec7_m); \ 103 \
141 \ 104 tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
142 vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src0), (v16i8)(src0)); \ 105 tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
143 vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src1)); \ 106 ST_UB(tmp_m, (pdst)); \
144 vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src2), (v16i8)(src2)); \ 107 }
145 vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src3), (v16i8)(src3)); \ 108
146 \ 109 #define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, \
147 res4_m = __msa_dpadd_s_h(res4_m, (v16i8)(filt3), (v16i8)vec4_m); \ 110 pdst, stride) { \
148 res5_m = __msa_dpadd_s_h(res5_m, (v16i8)(filt3), (v16i8)vec5_m); \ 111 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
149 res6_m = __msa_dpadd_s_h(res6_m, (v16i8)(filt3), (v16i8)vec6_m); \ 112 uint8_t *pdst_m = (uint8_t *)(pdst); \
150 res7_m = __msa_dpadd_s_h(res7_m, (v16i8)(filt3), (v16i8)vec7_m); \ 113 \
151 \ 114 PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \
152 out0 = __msa_adds_s_h(res0_m, res4_m); \ 115 PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
153 out1 = __msa_adds_s_h(res1_m, res5_m); \ 116 AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
154 out2 = __msa_adds_s_h(res2_m, res6_m); \ 117 ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
155 out3 = __msa_adds_s_h(res3_m, res7_m); \
156 } 118 }
157 #endif /* VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ */ 119 #endif /* VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ */
OLD | NEW
« no previous file with comments | « source/libvpx/vp9/common/mips/msa/vp9_convolve_copy_msa.c ('k') | source/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698