OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ |
| 10 |
| 11 #ifndef VP9_COMMON_MIPS_MSA_VP9_IDCT_MSA_H_ |
| 12 #define VP9_COMMON_MIPS_MSA_VP9_IDCT_MSA_H_ |
| 13 |
| 14 #include <assert.h> |
| 15 |
| 16 #include "vpx_ports/mem.h" |
| 17 #include "vp9/common/vp9_idct.h" |
| 18 #include "vp9/common/mips/msa/vp9_macros_msa.h" |
| 19 |
/*
 * VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1)
 *
 * Combines two halfword vectors with a pair of scalar constants and writes
 * two rounded halfword vectors.
 *   reg0, reg1   - input vectors; reg1 is additionally used negated
 *   cnst0, cnst1 - scalar constants, each broadcast with __msa_fill_h
 *   out0, out1   - lvalues assigned from __msa_pckev_h
 *
 * k0_m is the even-lane interleave of the two broadcasts (cnst0 in even
 * halfword lanes, cnst1 in odd lanes). Each output is a dot product of an
 * interleaved input pair with k0_m, passed through SRARI_W2_SW with
 * DCT_CONST_BITS and packed back to halfwords.
 * NOTE(review): ILVRL_H2_SW, DOTP_SH2_SW and SRARI_W2_SW are defined outside
 * this header. Assuming their usual operand order the results are
 *   out0 = round_shift(reg0 * cnst0 - reg1 * cnst1)
 *   out1 = round_shift(reg1 * cnst0 + reg0 * cnst1)
 * -- confirm against vp9_macros_msa.h.
 *
 * The body declares k0_m and s0_m..s3_m, so arguments must not be spelled
 * with those names. reg0 and reg1 are each expanded twice.
 */
| 20 #define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \ |
| 21 v8i16 k0_m = __msa_fill_h(cnst0); \ |
| 22 v4i32 s0_m, s1_m, s2_m, s3_m; \ |
| 23 \ |
| 24 s0_m = (v4i32)__msa_fill_h(cnst1); \ |
| 25 k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \ |
| 26 \ |
| 27 ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \ |
| 28 ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \ |
| 29 DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \ |
| 30 SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ |
| 31 out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ |
| 32 \ |
| 33 DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \ |
| 34 SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ |
| 35 out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ |
| 36 } |
| 37 |
/*
 * VP9_DOT_ADD_SUB_SRARI_PCK(in0..in7, dst0..dst3)
 *
 * Dot-product, butterfly, round and pack helper.
 *   in0..in3   - multiplicand vectors
 *   in4..in7   - constant vectors; as passed to DOTP_SH4_SW, in0/in1 appear
 *                together with in4 and in5, and in2/in3 with in6 and in7
 *   dst0..dst3 - outputs written by PCKEV_H4_SH
 *
 * Sequence: two DOTP_SH4_SW calls produce eight word vectors, two
 * BUTTERFLY_4 calls combine them, both SRARI_W4_SW calls use
 * DCT_CONST_BITS, and PCKEV_H4_SH packs the eight results into dst0..dst3.
 * NOTE(review): which products are added and which are subtracted depends
 * on BUTTERFLY_4, defined outside this header -- confirm there.
 *
 * The body declares tp0_m..tp9_m, so arguments must not use those names.
 * Every input is read before any dst is written, which is what allows the
 * call sites in VP9_ADST8 to pass their own inputs as destinations.
 */
| 38 #define VP9_DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 39 dst0, dst1, dst2, dst3) { \ |
| 40 v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \ |
| 41 v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \ |
| 42 \ |
| 43 DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, \ |
| 44 tp0_m, tp2_m, tp3_m, tp4_m); \ |
| 45 DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, \ |
| 46 tp5_m, tp6_m, tp7_m, tp8_m); \ |
| 47 BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \ |
| 48 BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \ |
| 49 SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \ |
| 50 SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \ |
| 51 PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, \ |
| 52 dst0, dst1, dst2, dst3); \ |
| 53 } |
| 54 |
/*
 * VP9_DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2)
 *
 * Statement expression (GNU C extension) that evaluates to a v8i16.
 *   in0, in1 - multiplicand vectors
 *   in2      - constant vector, passed to DOTP_SH2_SW for both products
 *
 * The two word-sized dot products are passed through SRARI_W2_SW with
 * DCT_CONST_BITS and their even halfwords are packed into one vector with
 * __msa_pckev_h, which is the value of the expression.
 * NOTE(review): which half of the result comes from in0 and which from in1
 * depends on the output order of DOTP_SH2_SW (defined elsewhere) -- confirm.
 *
 * The body declares dst_m, tp0_m and tp1_m; arguments must not use those
 * names.
 */
| 55 #define VP9_DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({ \ |
| 56 v8i16 dst_m; \ |
| 57 v4i32 tp0_m, tp1_m; \ |
| 58 \ |
| 59 DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \ |
| 60 SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \ |
| 61 dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \ |
| 62 \ |
| 63 dst_m; \ |
| 64 }) |
| 65 |
/*
 * VP9_ADST8(in0..in7, out0..out7)
 *
 * Eight-input, eight-output transform built from the cospi constants in
 * coeff0_m and coeff1_m (presumably the 8-point ADST, going by the name --
 * confirm against the C reference).
 *
 * WARNING: in0..in7 are used as scratch and are overwritten; they must be
 * modifiable v8i16 lvalues. out1, out3 and out5 are read back after being
 * written (final negation), so every out argument must be an lvalue too.
 *
 * Steps, in source order:
 *   1. Constant pairs from coeff0_m lanes 0/7 and 4/3; in0/in7 and in4/in3
 *      are interleaved and VP9_DOT_ADD_SUB_SRARI_PCK writes in7, in0, in4,
 *      in3.
 *   2. Same with coeff0_m lanes 2/5 and 6/1 on in2/in5 and in6/in1; the
 *      results land in in5, in2, in6, in1.
 *   3. BUTTERFLY_4 on in7, in0, in2, in5 yields s1_m, s0_m and new in2, in5;
 *      out0 = s1_m and out7 = -s0_m.
 *   4. Constant pairs from the cospi_8_64 / cospi_24_64 entries of coeff1_m
 *      (lanes 0, 4, 1, 5); in4/in3 and in6/in1 produce out1, out6, s0_m and
 *      s1_m.
 *   5. The cospi_16_64 entries of coeff1_m (lanes 2 and 3) applied to
 *      in2/in5 and s0_m/s1_m give out3, out4, out2 and out5.
 *   6. out1, out3 and out5 are negated.
 *
 * The last two lanes of coeff1_m are zero padding and are never splatted.
 * The body declares cnst0_m..cnst4_m, vec0_m..vec3_m, s0_m, s1_m, coeff0_m
 * and coeff1_m; arguments must not use those names.
 */
| 66 #define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 67 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 68 v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ |
| 69 v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ |
| 70 v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ |
| 71 cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ |
| 72 v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, \ |
| 73 -cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 }; \ |
| 74 \ |
| 75 SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ |
| 76 cnst2_m = -cnst0_m; \ |
| 77 ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ |
| 78 SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ |
| 79 cnst4_m = -cnst2_m; \ |
| 80 ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ |
| 81 \ |
| 82 ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ |
| 83 ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ |
| 84 VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ |
| 85 cnst1_m, cnst2_m, cnst3_m, in7, in0, \ |
| 86 in4, in3); \ |
| 87 \ |
| 88 SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ |
| 89 cnst2_m = -cnst0_m; \ |
| 90 ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ |
| 91 SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ |
| 92 cnst4_m = -cnst2_m; \ |
| 93 ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ |
| 94 \ |
| 95 ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ |
| 96 ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ |
| 97 \ |
| 98 VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ |
| 99 cnst1_m, cnst2_m, cnst3_m, in5, in2, \ |
| 100 in6, in1); \ |
| 101 BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ |
| 102 out7 = -s0_m; \ |
| 103 out0 = s1_m; \ |
| 104 \ |
| 105 SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, \ |
| 106 cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ |
| 107 \ |
| 108 ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ |
| 109 cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ |
| 110 cnst1_m = cnst0_m; \ |
| 111 \ |
| 112 ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ |
| 113 ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ |
| 114 VP9_DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ |
| 115 cnst2_m, cnst3_m, cnst1_m, out1, out6, \ |
| 116 s0_m, s1_m); \ |
| 117 \ |
| 118 SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ |
| 119 cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ |
| 120 \ |
| 121 ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ |
| 122 ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ |
| 123 out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ |
| 124 out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ |
| 125 out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ |
| 126 out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ |
| 127 \ |
| 128 out1 = -out1; \ |
| 129 out3 = -out3; \ |
| 130 out5 = -out5; \ |
| 131 } |
| 132 |
/*
 * VP9_MADD_SHORT(m0, m1, c0, c1, res0, res1)
 *
 * Interleaves m1 with m0, takes dot products of the interleaved halves with
 * c0 (twice) and c1 (twice), applies SRARI_W4_SW with DCT_CONST_BITS and
 * packs the results to halfwords.
 *   m0, m1     - input vectors
 *   c0, c1     - constant-pair vectors (see VP9_SET_COSPI_PAIR)
 *   res0, res1 - outputs; as passed to PCKEV_H2_SH, res0 receives the two
 *                c0 products and res1 the two c1 products
 *
 * m0 and m1 are consumed by the interleave before res0/res1 are written, so
 * the outputs may alias the inputs, as VP9_IADST8x16_1D does.
 * The body declares madd0_m..madd3_m, madd_s0_m and madd_s1_m; arguments
 * must not use those names.
 */
| 133 #define VP9_MADD_SHORT(m0, m1, c0, c1, res0, res1) { \ |
| 134 v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \ |
| 135 v8i16 madd_s0_m, madd_s1_m; \ |
| 136 \ |
| 137 ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \ |
| 138 DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, \ |
| 139 c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m); \ |
| 140 SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \ |
| 141 PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \ |
| 142 } |
| 143 |
/*
 * VP9_MADD_BF(inp0..inp3, cst0..cst3, out0..out3)
 *
 * Multiply-add followed by a butterfly.
 *   inp0..inp3 - input vectors; inp1/inp0 and inp3/inp2 are interleaved
 *   cst0..cst3 - constant-pair vectors
 *   out0..out3 - outputs
 *
 * First pass: the inp1/inp0 interleave is dotted with cst0 and the
 * inp3/inp2 interleave with cst2; BUTTERFLY_4 combines the four products,
 * SRARI_W4_SW uses DCT_CONST_BITS, and PCKEV_H2_SH writes out0 and out1.
 * Second pass: identical, using cst1 and cst3, writing out2 and out3.
 * NOTE(review): which of out0/out1 (and out2/out3) holds the sum and which
 * the difference depends on BUTTERFLY_4, defined elsewhere -- confirm.
 *
 * The interleaved inputs are kept in madd_s0_m..madd_s3_m, so the inputs
 * themselves are not re-read after the first two statements.
 * The body declares madd_s0_m..madd_s3_m, tmp0_m..tmp3_m, m4_m and m5_m;
 * arguments must not use those names.
 */
| 144 #define VP9_MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \ |
| 145 out0, out1, out2, out3) { \ |
| 146 v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ |
| 147 v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \ |
| 148 \ |
| 149 ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \ |
| 150 ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \ |
| 151 DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \ |
| 152 cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ |
| 153 BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \ |
| 154 m4_m, m5_m, tmp3_m, tmp2_m); \ |
| 155 SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ |
| 156 PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \ |
| 157 DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \ |
| 158 cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ |
| 159 BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \ |
| 160 m4_m, m5_m, tmp3_m, tmp2_m); \ |
| 161 SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ |
| 162 PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ |
| 163 } |
| 164 |
/*
 * VP9_SET_COSPI_PAIR(c0_h, c1_h)
 *
 * Statement expression (GNU C extension) that evaluates to a v8i16 constant
 * vector holding the two scalars in alternating lanes: both scalars are
 * broadcast with __msa_fill_h and the broadcasts are merged with
 * __msa_ilvev_h, leaving c0_h in the even halfword lanes and c1_h in the odd
 * ones.
 *   c0_h, c1_h - scalar constants, each expanded exactly once
 *
 * The body declares out0_m, r0_m and r1_m; arguments must not use those
 * names.
 */
| 165 #define VP9_SET_COSPI_PAIR(c0_h, c1_h) ({ \ |
| 166 v8i16 out0_m, r0_m, r1_m; \ |
| 167 \ |
| 168 r0_m = __msa_fill_h(c0_h); \ |
| 169 r1_m = __msa_fill_h(c1_h); \ |
| 170 out0_m = __msa_ilvev_h(r1_m, r0_m); \ |
| 171 \ |
| 172 out0_m; \ |
| 173 }) |
| 174 |
/*
 * VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3)
 *
 * Adds four halfword vectors to four rows read from dst and stores the
 * result back to dst.
 *   dst        - destination pointer; evaluated once and cast to uint8_t *
 *   dst_stride - row stride handed to LD_UB4 and ST8x4_UB; expanded twice,
 *                so it should be free of side effects
 *   in0..in3   - halfword vectors added to rows 0..3
 *
 * Sequence: LD_UB4 loads four rows, ILVR_B4_SH interleaves them with a zero
 * vector (widening bytes to halfwords), ADD4 adds in0..in3, CLIP_SH4_0_255
 * clamps, PCKEV_B2_SB packs back to bytes and ST8x4_UB stores.
 * NOTE(review): going by the helper names, 16 bytes per row are loaded but
 * only 8 bytes per row are used and stored -- confirm in vp9_macros_msa.h
 * that the wide load is safe at the right edge of the buffer.
 *
 * The body declares dst_m, dst0_m..dst3_m, tmp0_m, tmp1_m, zero_m and
 * res0_m..res3_m; arguments must not use those names.
 */
| 175 #define VP9_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) { \ |
| 176 uint8_t *dst_m = (uint8_t *) (dst); \ |
| 177 v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \ |
| 178 v16i8 tmp0_m, tmp1_m; \ |
| 179 v16i8 zero_m = { 0 }; \ |
| 180 v8i16 res0_m, res1_m, res2_m, res3_m; \ |
| 181 \ |
| 182 LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \ |
| 183 ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, \ |
| 184 zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m); \ |
| 185 ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, \ |
| 186 res0_m, res1_m, res2_m, res3_m); \ |
| 187 CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \ |
| 188 PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \ |
| 189 ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \ |
| 190 } |
| 191 |
/*
 * VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3)
 *
 * Four-input, four-output transform step using cospi_16_64, cospi_8_64 and
 * cospi_24_64 (presumably one 1-D pass of the 4x4 inverse DCT, going by the
 * name -- confirm against the C reference).
 *   in0..in3   - v8i16 inputs; they are combined only through __msa_ilvr_h,
 *                i.e. only the right-half halfwords of each are read
 *   out0..out3 - v8i16 outputs written by BUTTERFLY_4
 *
 * Sequence: in2/in0 are interleaved and dotted with the
 * (cospi_16_64, cospi_16_64) and (cospi_16_64, -cospi_16_64) pairs; in3/in1
 * are interleaved and dotted with (cospi_24_64, -cospi_8_64) and
 * (cospi_8_64, cospi_24_64). The four products go through SRARI_W4_SW with
 * DCT_CONST_BITS, are packed pairwise to halfwords, split again with
 * SLDI_B2_0_SW (shift amount 8), and fed to BUTTERFLY_4.
 *
 * The inputs are not modified. The body declares c0_m..c3_m, step0_m,
 * step1_m and tmp0_m..tmp3_m; arguments must not use those names.
 */
| 192 #define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) { \ |
| 193 v8i16 c0_m, c1_m, c2_m, c3_m; \ |
| 194 v8i16 step0_m, step1_m; \ |
| 195 v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ |
| 196 \ |
| 197 c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ |
| 198 c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ |
| 199 step0_m = __msa_ilvr_h(in2, in0); \ |
| 200 DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \ |
| 201 \ |
| 202 c2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ |
| 203 c3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ |
| 204 step1_m = __msa_ilvr_h(in3, in1); \ |
| 205 DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \ |
| 206 SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ |
| 207 \ |
| 208 PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \ |
| 209 SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \ |
| 210 BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, \ |
| 211 (v8i16)tmp2_m, (v8i16)tmp3_m, \ |
| 212 out0, out1, out2, out3); \ |
| 213 } |
| 214 |
/*
 * VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3)
 *
 * Four-input, four-output transform step using the sinpi_k_9 constants
 * (presumably one 1-D pass of the 4x4 inverse ADST, going by the name --
 * confirm against the C reference).
 *   in0..in3   - v8i16 inputs; only read, and only through right-half
 *                interleaves (ILVR_H2_SH / __msa_ilvr_h)
 *   out0..out3 - v8i16 outputs written by PCKEV_H2_SH
 *
 * mask_m holds sinpi_1_9..sinpi_4_9 in lanes 0..3 and their negations in
 * lanes 4..7. Four word accumulators are built as sums of dot products:
 *   int0_m, int1_m, int2_m - two dot products each
 *   int3_m                 - tmp2_m from the int0_m step plus three further
 *                            dot products
 * All four go through SRARI_W4_SW with DCT_CONST_BITS. Each output is
 * PCKEV_H2_SH of one accumulator with itself, so both halves of an output
 * presumably carry the same four values -- confirm against the helper.
 *
 * NOTE(review): the ILVR_H2_SH before int2_m repeats the first interleave
 * with identical operands, and __msa_splati_h(mask_m, 6) is evaluated twice;
 * both look redundant but are left untouched here.
 *
 * The body declares res0_m, res1_m, c0_m, c1_m, k1_m..k4_m, zero_m,
 * tmp0_m..tmp3_m, int0_m..int3_m and mask_m; arguments must not use those
 * names.
 */
| 215 #define VP9_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) { \ |
| 216 v8i16 res0_m, res1_m, c0_m, c1_m; \ |
| 217 v8i16 k1_m, k2_m, k3_m, k4_m; \ |
| 218 v8i16 zero_m = { 0 }; \ |
| 219 v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ |
| 220 v4i32 int0_m, int1_m, int2_m, int3_m; \ |
| 221 v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, \ |
| 222 sinpi_4_9, -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, \ |
| 223 -sinpi_4_9 }; \ |
| 224 \ |
| 225 SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \ |
| 226 ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \ |
| 227 ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ |
| 228 DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \ |
| 229 int0_m = tmp2_m + tmp1_m; \ |
| 230 \ |
| 231 SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \ |
| 232 ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \ |
| 233 DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ |
| 234 int1_m = tmp0_m + tmp1_m; \ |
| 235 \ |
| 236 c0_m = __msa_splati_h(mask_m, 6); \ |
| 237 ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \ |
| 238 ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ |
| 239 DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ |
| 240 int2_m = tmp0_m + tmp1_m; \ |
| 241 \ |
| 242 c0_m = __msa_splati_h(mask_m, 6); \ |
| 243 c0_m = __msa_ilvev_h(c0_m, k1_m); \ |
| 244 \ |
| 245 res0_m = __msa_ilvr_h((in1), (in3)); \ |
| 246 tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \ |
| 247 int3_m = tmp2_m + tmp0_m; \ |
| 248 \ |
| 249 res0_m = __msa_ilvr_h((in2), (in3)); \ |
| 250 c1_m = __msa_ilvev_h(k4_m, k3_m); \ |
| 251 \ |
| 252 tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \ |
| 253 res1_m = __msa_ilvr_h((in0), (in2)); \ |
| 254 c1_m = __msa_ilvev_h(k1_m, zero_m); \ |
| 255 \ |
| 256 tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \ |
| 257 int3_m += tmp2_m; \ |
| 258 int3_m += tmp3_m; \ |
| 259 \ |
| 260 SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \ |
| 261 PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \ |
| 262 PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \ |
| 263 } |
| 264 |
/*
 * VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h)
 *
 * Statement expression (GNU C extension) that evaluates to a v8i16 constant
 * pair taken from a coefficient table: two lanes of mask_h are splatted
 * with SPLATI_H2_SH and merged with __msa_ilvev_h.
 *   mask_h         - v8i16 table of constants
 *   idx1_h, idx2_h - lane indices into mask_h
 *
 * NOTE(review): assuming SPLATI_H2_SH maps idx1_h to c0_m and idx2_h to
 * c1_m, the result has lane idx1_h in the even halfword lanes and lane
 * idx2_h in the odd ones -- confirm in vp9_macros_msa.h.
 *
 * The body declares c0_m and c1_m, so mask_h must not be a variable with
 * either of those names.
 */
| 265 #define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) ({ \ |
| 266 v8i16 c0_m, c1_m; \ |
| 267 \ |
| 268 SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \ |
| 269 c0_m = __msa_ilvev_h(c1_m, c0_m); \ |
| 270 \ |
| 271 c0_m; \ |
| 272 }) |
| 273 |
| 274 /*
 * VP9_MADD(inp0..inp3, cst0..cst3, out0..out3): multiply and add macro.
 *
 *   inp0, inp1 - first input pair; interleaved and dotted with cst0 (packed
 *                into out0) and with cst1 (packed into out1)
 *   inp2, inp3 - second input pair; interleaved and dotted with cst2 (packed
 *                into out2) and with cst3 (packed into out3)
 *   cst0..cst3 - constant-pair vectors
 *
 * Every product goes through SRARI_W4_SW with DCT_CONST_BITS before it is
 * packed to halfwords. Both interleaves are done before any output is
 * written, so the outputs may alias the inputs, as VP9_IDCT8x8_1D does.
 *
 * The body declares madd_s0_m..madd_s3_m and tmp0_m..tmp3_m; arguments must
 * not use those names.
 */ |
| 275 #define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \ |
| 276 out0, out1, out2, out3) { \ |
| 277 v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ |
| 278 v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ |
| 279 \ |
| 280 ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ |
| 281 ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ |
| 282 DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, \ |
| 283 cst0, cst0, cst1, cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ |
| 284 SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ |
| 285 PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \ |
| 286 DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, \ |
| 287 cst2, cst2, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ |
| 288 SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ |
| 289 PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \ |
| 290 } |
| 291 |
| 292 /*
 * VP9_IDCT8x8_1D(in0..in7, out0..out7): idct 8x8 macro (one 1-D pass over
 * eight v8i16 vectors).
 *
 * WARNING: in0..in7 are overwritten. Both VP9_MADD calls write their
 * results back into their own inputs, so every in argument must be a
 * modifiable lvalue.
 *
 * Steps, in source order:
 *   1. Odd inputs: VP9_MADD on in1/in7 and in3/in5 with constant pairs taken
 *      from mask_m (cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64 and
 *      negations), stored back in place.
 *   2. res0_m = in1 - in3 and res1_m = in7 - in5 are interleaved and dotted
 *      with the (cospi_16_64, -cospi_16_64) pair and the cospi_16_64 splat,
 *      giving tp5_m and tp6_m; tp4_m = in1 + in3 and tp7_m = in7 + in5.
 *   3. Even inputs: VP9_MADD on in0/in4 and in2/in6 with the cospi_16_64
 *      constants and the cospi_24_64 / cospi_8_64 pairs, stored in place,
 *      then BUTTERFLY_4 produces tp0_m..tp3_m.
 *   4. BUTTERFLY_8 combines tp0_m..tp7_m into out0..out7.
 *
 * The body declares tp0_m..tp7_m, k0_m..k3_m, res0_m..res3_m,
 * tmp0_m..tmp3_m and mask_m; arguments must not use those names.
 */ |
| 293 #define VP9_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 294 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 295 v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \ |
| 296 v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \ |
| 297 v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ |
| 298 v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \ |
| 299 cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \ |
| 300 \ |
| 301 k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5); \ |
| 302 k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0); \ |
| 303 k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3); \ |
| 304 k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2); \ |
| 305 VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \ |
| 306 SUB2(in1, in3, in7, in5, res0_m, res1_m); \ |
| 307 k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7); \ |
| 308 k1_m = __msa_splati_h(mask_m, 4); \ |
| 309 \ |
| 310 ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \ |
| 311 DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \ |
| 312 tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ |
| 313 SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ |
| 314 tp4_m = in1 + in3; \ |
| 315 PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \ |
| 316 tp7_m = in7 + in5; \ |
| 317 k2_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ |
| 318 k3_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ |
| 319 VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, \ |
| 320 in0, in4, in2, in6); \ |
| 321 BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \ |
| 322 BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, \ |
| 323 out0, out1, out2, out3, out4, out5, out6, out7); \ |
| 324 } |
| 325 |
/*
 * VP9_IADST8x8_1D(in0..in7, out0..out7)
 *
 * Eight-input, eight-output transform pass over v8i16 vectors using the
 * cospi constants in mask1_m, mask2_m and mask3_m (presumably one 1-D pass
 * of the 8x8 inverse ADST, going by the name -- confirm against the C
 * reference).
 *
 * WARNING: in1, in2, in3, in4, in5 and in7 are reused as scratch and are
 * overwritten; in0 and in6 are only read. All in arguments should be
 * treated as clobbered lvalues.
 *
 * Steps, in source order:
 *   1. in1/in0 and in5/in4 are interleaved and dotted with pairs from
 *      mask1_m/mask2_m. The sums are rounded and packed into res0_m and
 *      res1_m, the differences into t0_m and t1_m.
 *   2. The same pattern on in3/in2 and in7/in6 yields res2_m and res3_m
 *      (sums) and r2_m, r3_m (differences); the latter two are interleaved
 *      into m2_m and m3_m.
 *   3. BUTTERFLY_4 on res0_m, res1_m, res3_m, res2_m writes out0 and the
 *      scratch values in7, in4, in3.
 *   4. t0_m/t1_m and m2_m/m3_m are dotted with the cospi_8_64 / cospi_24_64
 *      pairs. The sums give in1 and out6, the differences in2 and in5.
 *   5. The (cospi_16_64, cospi_16_64) and (cospi_16_64, -cospi_16_64) pairs
 *      from mask3_m are applied to in4/in3, giving in3 and out4, and to
 *      in5/in2, giving out2 and in5.
 *   6. out1 = -in1, out3 = -in3, out5 = -in5, out7 = -in7.
 *
 * Several DOTP_SH4_SW calls name the same temporaries as inputs and outputs
 * (for example m2_m and m3_m in step 5), so the statement order and the
 * evaluation order inside the helper macros must not be changed.
 * Every SRARI_W4_SW call uses DCT_CONST_BITS.
 *
 * The body declares r0_m..r7_m, m0_m..m3_m, t0_m, t1_m, res0_m..res3_m,
 * k0_m, k1_m, in_s0, in_s1 and mask1_m..mask3_m; arguments must not use
 * those names.
 */
| 326 #define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 327 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 328 v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \ |
| 329 v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \ |
| 330 v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \ |
| 331 v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, \ |
| 332 cospi_10_64, cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \ |
| 333 v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, \ |
| 334 cospi_6_64, -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \ |
| 335 v8i16 mask3_m = { -cospi_24_64, cospi_8_64, cospi_16_64, \ |
| 336 -cospi_16_64, 0, 0, 0, 0 }; \ |
| 337 \ |
| 338 k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1); \ |
| 339 k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2); \ |
| 340 ILVRL_H2_SH(in1, in0, in_s1, in_s0); \ |
| 341 DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ |
| 342 r0_m, r1_m, r2_m, r3_m); \ |
| 343 k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7); \ |
| 344 k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1); \ |
| 345 ILVRL_H2_SH(in5, in4, in_s1, in_s0); \ |
| 346 DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ |
| 347 r4_m, r5_m, r6_m, r7_m); \ |
| 348 ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ |
| 349 m0_m, m1_m, m2_m, m3_m); \ |
| 350 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ |
| 351 PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \ |
| 352 SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ |
| 353 m0_m, m1_m, m2_m, m3_m); \ |
| 354 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ |
| 355 PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \ |
| 356 k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4); \ |
| 357 k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5); \ |
| 358 ILVRL_H2_SH(in3, in2, in_s1, in_s0); \ |
| 359 DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ |
| 360 r0_m, r1_m, r2_m, r3_m); \ |
| 361 k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3); \ |
| 362 k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4); \ |
| 363 ILVRL_H2_SH(in7, in6, in_s1, in_s0); \ |
| 364 DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ |
| 365 r4_m, r5_m, r6_m, r7_m); \ |
| 366 ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ |
| 367 m0_m, m1_m, m2_m, m3_m); \ |
| 368 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ |
| 369 PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \ |
| 370 SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ |
| 371 m0_m, m1_m, m2_m, m3_m); \ |
| 372 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ |
| 373 PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \ |
| 374 ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \ |
| 375 BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \ |
| 376 k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6); \ |
| 377 k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7); \ |
| 378 ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \ |
| 379 DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ |
| 380 r0_m, r1_m, r2_m, r3_m); \ |
| 381 k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1); \ |
| 382 DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \ |
| 383 r4_m, r5_m, r6_m, r7_m); \ |
| 384 ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \ |
| 385 m0_m, m1_m, m2_m, m3_m); \ |
| 386 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ |
| 387 PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \ |
| 388 SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \ |
| 389 m0_m, m1_m, m2_m, m3_m); \ |
| 390 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ |
| 391 PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \ |
| 392 k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2); \ |
| 393 k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3); \ |
| 394 ILVRL_H2_SH(in4, in3, in_s1, in_s0); \ |
| 395 DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ |
| 396 m0_m, m1_m, m2_m, m3_m); \ |
| 397 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ |
| 398 PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \ |
| 399 ILVRL_H2_SW(in5, in2, m2_m, m3_m); \ |
| 400 DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \ |
| 401 m0_m, m1_m, m2_m, m3_m); \ |
| 402 SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ |
| 403 PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \ |
| 404 \ |
| 405 out1 = -in1; \ |
| 406 out3 = -in3; \ |
| 407 out5 = -in5; \ |
| 408 out7 = -in7; \ |
| 409 } |
| 410 |
/*
 * VP9_IADST8x16_1D(r0..r15, out0..out15)
 *
 * Sixteen-input, sixteen-output transform pass over v8i16 vectors, built
 * from VP9_MADD_BF, VP9_MADD_SHORT and butterflies in four stages
 * (presumably one 1-D pass of the 16-point inverse ADST on eight columns,
 * going by the name -- confirm against the C reference).
 *   r0..r15     - inputs; only read here (they are passed as the inp
 *                 arguments of VP9_MADD_BF)
 *   out0..out15 - outputs; out6, out7, out10, out11, out14 and out15 are
 *                 written in an earlier stage and then rewritten in place in
 *                 stage 4, so all out arguments must be distinct lvalues
 *
 * The body declares g0_m..g15_m, h0_m..h11_m and k0_m..k3_m; arguments must
 * not use those names, nor the locals of the nested macros.
 */
| 411 #define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, \ |
| 412 r9, r10, r11, r12, r13, r14, r15, \ |
| 413 out0, out1, out2, out3, out4, out5, \ |
| 414 out6, out7, out8, out9, out10, out11, \ |
| 415 out12, out13, out14, out15) { \ |
| 416 v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \ |
| 417 v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \ |
| 418 v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \ |
| 419 v8i16 h8_m, h9_m, h10_m, h11_m; \ |
| 420 v8i16 k0_m, k1_m, k2_m, k3_m; \ |
| 421 \ |
| 422 /* stage 1: four VP9_MADD_BF calls turn r0..r15 into g0_m..g15_m */ \ |
| 423 k0_m = VP9_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \ |
| 424 k1_m = VP9_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \ |
| 425 k2_m = VP9_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \ |
| 426 k3_m = VP9_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \ |
| 427 VP9_MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, \ |
| 428 g0_m, g1_m, g2_m, g3_m); \ |
| 429 k0_m = VP9_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \ |
| 430 k1_m = VP9_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \ |
| 431 k2_m = VP9_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \ |
| 432 k3_m = VP9_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \ |
| 433 VP9_MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, \ |
| 434 g4_m, g5_m, g6_m, g7_m); \ |
| 435 k0_m = VP9_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \ |
| 436 k1_m = VP9_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \ |
| 437 k2_m = VP9_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \ |
| 438 k3_m = VP9_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \ |
| 439 VP9_MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, \ |
| 440 g8_m, g9_m, g10_m, g11_m); \ |
| 441 k0_m = VP9_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \ |
| 442 k1_m = VP9_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \ |
| 443 k2_m = VP9_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \ |
| 444 k3_m = VP9_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \ |
| 445 VP9_MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, \ |
| 446 g12_m, g13_m, g14_m, g15_m); \ |
| 447 \ |
| 448 /* stage 2: fills h0_m..h11_m and out8..out11; the BUTTERFLY_8 overwrites h0_m, h2_m, h4_m and h6_m */ \ |
| 449 k0_m = VP9_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \ |
| 450 k1_m = VP9_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \ |
| 451 k2_m = VP9_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \ |
| 452 VP9_MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, \ |
| 453 h0_m, h1_m, h2_m, h3_m); \ |
| 454 k0_m = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \ |
| 455 k1_m = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \ |
| 456 k2_m = VP9_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \ |
| 457 VP9_MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, \ |
| 458 h4_m, h5_m, h6_m, h7_m); \ |
| 459 BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \ |
| 460 BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, \ |
| 461 h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \ |
| 462 \ |
| 463 /* stage 3: writes out0, out1, out4..out7 and out12..out15; h10_m and h11_m are updated in place */ \ |
| 464 BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \ |
| 465 k0_m = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ |
| 466 k1_m = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ |
| 467 k2_m = VP9_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \ |
| 468 VP9_MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, \ |
| 469 out4, out6, out5, out7); \ |
| 470 VP9_MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, \ |
| 471 out12, out14, out13, out15); \ |
| 472 \ |
| 473 /* stage 4: cospi_16_64 products give out2/out3 from h10_m/h11_m and rewrite out6/out7, out10/out11, out14/out15 in place */ \ |
| 474 k0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ |
| 475 k1_m = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \ |
| 476 k2_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ |
| 477 k3_m = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \ |
| 478 VP9_MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \ |
| 479 VP9_MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \ |
| 480 VP9_MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \ |
| 481 VP9_MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \ |
| 482 } |
| 483 #endif /* VP9_COMMON_MIPS_MSA_VP9_IDCT_MSA_H_ */ |
OLD | NEW |